def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script merges either the exons or CDS regions of all "
            "transcript isoforms into a single \"super gene isoform\". It does this "
            "based on the given GTF feature type and attribute (with defaults \"CDS\" "
            "and \"gene_id\", respectively).")

    parser.add_argument('gtf', help="The GTF file")
    parser.add_argument('out', help="The output (merged) GTF file")

    parser.add_argument('--feature-type', help="The type of features to merge",
        default=default_feature_type)

    parser.add_argument('--group-attribute', help="The attribute by which the "
        "features will be merged", default=default_group_attribute)

    parser.add_argument('--id-format-str', help="The python format string to "
        "use for creating the \"transcript\" identifiers",
        default=default_id_format_str)

    parser.add_argument('--chr-name-file', help="If this file is specified, it will "
        "be used to determine the seqname sort order. This should be the "
        "\"chrName.txt\" file created by STAR. If not present, the transcripts "
        "will be sorted alphabetically (1, 10, 11, 2, ..., KL568162.1, MT, X, Y).",
        default=default_chr_name_file)

    parser.add_argument('--add-exons', help="If this flag is given, then all "
        "features will be duplicated, but with the feature type \"exon\". "
        "Presumably, this should be given when \"CDS\" features are merged, and "
        "the resulting GTF file will be used by STAR (or anything else expecting "
        "\"exon\"s).", action='store_true')

    parser.add_argument('-g', '--num-groups', help="The number of groups into "
        "which to split the features. More groups means the progress bar is "
        "updated more frequently but incurs more overhead because of the "
        "parallel calls.", type=int, default=default_num_groups)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    msg = "Reading GTF file"
    logger.info(msg)
    gtf_df = gtf_utils.read_gtf(args.gtf)

    msg = "Extracting desired features"
    logger.info(msg)
    m_feature_type = gtf_df['feature'] == args.feature_type
    gtf_feature_df = gtf_df[m_feature_type]

    msg = "Parsing GTF attributes"
    logger.info(msg)
    attributes = parallel.apply_parallel_split(gtf_feature_df, args.num_cpus,
        parse_attributes_group, progress_bar=True, num_groups=args.num_groups)
    attributes_df = pd.concat(attributes)
    attributes_df['end'] = attributes_df['end'].astype(int)
    attributes_df['start'] = attributes_df['start'].astype(int)

    msg = "Merging isoforms"
    logger.info(msg)
    gene_features = attributes_df.groupby(args.group_attribute)
    merged_genes = parallel.apply_parallel_groups(gene_features, args.num_cpus,
        merge_gene_group, args.group_attribute, args.id_format_str,
        progress_bar=True)
    merged_genes_df = pd.concat(merged_genes)

    if args.add_exons:
        merged_exons = merged_genes_df.copy()
        merged_exons['feature'] = 'exon'
        merged_genes_df = pd.concat([merged_exons, merged_genes_df])

    merged_genes_df['start'] = merged_genes_df['start'].astype(int)

    # now, sort the merged isoforms
    # this is a bit of a hack, because it is actually using the sorting routine
    # for bed data frames
    # we need a dummy 'id' column for sorting, so just use the attributes
    merged_genes_df['id'] = merged_genes_df['attributes']
    merged_genes_df = bed_utils.sort(merged_genes_df,
        seqname_order=args.chr_name_file)

    # last, drop duplicate rows
    fields = ['seqname', 'source', 'feature', 'start', 'end', 'strand']
    merged_genes_df = merged_genes_df.drop_duplicates(subset=fields)

    gtf_utils.write_gtf(merged_genes_df, args.out, compress=False)
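
# merge_gene_group is referenced above but not defined in this excerpt. For
# reference, a minimal sketch of the interval-union step it performs per gene
# is shown below. This is an illustration only: the real helper also carries
# the GTF source, score, and frame fields through, and builds the attributes
# string with id_format_str.
def merge_intervals_sketch(starts, ends):
    """Merge overlapping [start, end] intervals (a simplified stand-in for
    the per-gene merging done by merge_gene_group)."""
    intervals = sorted(zip(starts, ends))
    if not intervals:
        return []

    merged = [list(intervals[0])]
    for start, end in intervals[1:]:
        if start <= merged[-1][1]:
            # overlaps (or abuts) the previous interval, so extend it
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    return merged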
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Extract the ORF profiles for each specified read length "
            "and offset independently. One sparse matrix file will be created "
            "for each read length. It then collects the values into a sparse "
            "tensor.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name', help="The name for the dataset, used in the "
        "created files")
    parser.add_argument('out', help="The (mtx.gz) output file containing the "
        "ORF profiles and read lengths")

    parser.add_argument('-c', '--is-condition', help="If this flag is present, "
        "then \"name\" will be taken to be a condition name. The profiles for "
        "all relevant replicates of the condition will be created.",
        action='store_true')

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    cpus_str = "--num-cpus {}".format(args.num_cpus)

    msg = "[create-read-length-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    msg = "Reading config file"
    logger.info(msg)
    config = yaml.load(open(args.config))

    # pull out what we need from the config file
    is_unique = 'keep_riboseq_multimappers' not in config
    seqname_str = utils.get_config_argument(config, 'seqname_prefix')
    note = config.get('note', None)
    orf_note = config.get('orf_note', None)

    orfs = filenames.get_orfs(config['genome_base_path'],
        config['genome_name'], note=orf_note)

    exons = filenames.get_exons(config['genome_base_path'],
        config['genome_name'], note=orf_note)

    # make sure the necessary files exist
    required_files = [orfs, exons]
    msg = "[create-read-length-orf-profiles]: Some input files were missing: "
    utils.check_files_exist(required_files, msg=msg)

    # check which samples to process
    names = [args.name]
    if args.is_condition:
        riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
        names = [n for n in riboseq_replicates[args.name]]

    job_ids = []
    for name in names:
        msg = "Processing sample: {}".format(name)
        logger.info(msg)

        # now the relevant files
        bam = filenames.get_riboseq_bam(config['riboseq_data'], name,
            is_unique=is_unique, note=note)

        # make sure the necessary files exist
        required_files = [bam]
        msg = "[create-read-length-orf-profiles]: Some input files were missing: "
        utils.check_files_exist(required_files, msg=msg)

        # get the lengths and offsets which meet the required criteria from
        # the config file
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config, name, is_unique=is_unique)

        if len(lengths) == 0:
            msg = ("No periodic read lengths and offsets were found. Try "
                "relaxing min_metagene_profile_count, min_metagene_bf_mean, "
                "max_metagene_bf_var, and/or min_metagene_bf_likelihood. "
                "Quitting.")
            logger.critical(msg)
            return

        for length, offset in zip(lengths, offsets):
            lengths_str = "--lengths {}".format(length)
            offsets_str = "--offsets {}".format(offset)

            mtx = filenames.get_riboseq_profiles(config['riboseq_data'], name,
                length=[length], offset=[offset], is_unique=is_unique,
                note=note)

            cmd = "extract-orf-profiles {} {} {} {} {} {} {} {} {}".format(
                bam, orfs, exons, mtx, lengths_str, offsets_str, seqname_str,
                cpus_str, logging_str)
            job_id = slurm.check_sbatch(cmd, args=args)

            job_ids.append(job_id)

    # now, collect them into a single file
    offsets_str = ' '.join(str(o) for o in offsets)
    lengths_str = ' '.join(str(l) for l in lengths)

    offsets_str = "--offsets {}".format(offsets_str)
    lengths_str = "--lengths {}".format(lengths_str)

    is_condition_str = ""
    if args.is_condition:
        is_condition_str = "--is-condition"

    cmd = "collect-read-length-orf-profiles {} {} {} {} {}".format(
        args.config, args.name, args.out, is_condition_str, logging_str)

    slurm.check_sbatch(cmd, args=args, dependencies=job_ids)
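
# Illustrative invocation (paths and the sample name are hypothetical; the
# entry-point name matches the tag this script uses in its own log messages):
#
#   create-read-length-orf-profiles config.yaml my-sample my-sample.profiles.mtx.gz \
#       --num-cpus 4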
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates the plots which detail the basic "
            "characteristics of the ORF predictions from the Rp-Bp pipeline. "
            "It also creates and compiles (if possible) a latex report for them.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('out', help="The base output directory for the latex report")

    parser.add_argument('--show-unfiltered-orfs', help="If this flag is "
        "present, bar charts showing the distribution of the types of the "
        "unfiltered ORF set will be included", action='store_true')

    parser.add_argument('--show-orf-periodicity', help="If this flag is "
        "present, bar charts showing the periodicity of each ORF type will be "
        "included in the report.", action='store_true')

    parser.add_argument('--uniprot', help="The uniprot ORF lengths, if available",
        default=default_uniprot)
    parser.add_argument('--uniprot-label', help="The label to use for the "
        "uniprot ORFs in the plot", default=default_uniprot_label)
    parser.add_argument('--image-type', help="The format of the image files. "
        "This must be a format usable by matplotlib.",
        default=default_image_type)

    parser.add_argument('--overwrite', help="If this flag is present, existing "
        "files will be overwritten.", action='store_true')

    parser.add_argument('--note', help="If this option is given, it will be "
        "used in the filenames.\n\nN.B. This REPLACES the note in the config "
        "file.", default=default_note)

    parser.add_argument('--show-chisq', help="If this flag is given, then the "
        "results from Rp-chi will be included in the document; otherwise, they "
        "will not be created or shown.", action='store_true')

    parser.add_argument('-t', '--tmp', help="A location for temporary files",
        default=default_tmp)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config))

    # keep multimappers?
    is_unique = 'keep_riboseq_multimappers' not in config

    programs = [
        'create-orf-length-distribution-line-graph',
        'create-orf-types-bar-chart',
        'visualize-orf-type-metagene-profiles'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = ['riboseq_data', 'riboseq_samples']
    utils.check_keys_exist(config, required_keys)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # by default, we will not include chisq
    chisq_values = [False]
    if args.show_chisq:
        chisq_values = [True, False]

    filtered_values = [True]
    if args.show_unfiltered_orfs:
        filtered_values = [True, False]

    grouped_values = [True, False]

    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)

    # first, create all of the figures
    create_all_figures(config, args)

    note_str = config.get('note', None)
    out_note_str = note_str
    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note

    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations', None)

    project_name = config.get("project_name", default_project_name)
    title = "Rp-Bp prediction analysis for {}".format(project_name)
    abstract = "This document shows the results of the Rp-Bp pipeline analysis."

    #tex_file = os.path.join(args.out, "prediction-report.tex")
    tex_file = filenames.get_rpbp_prediction_report(args.out, out_note_str)

    with open(tex_file, 'w') as out:
        latex.begin_document(out, title, abstract)

        latex.write(out, "\n")
        latex.clearpage(out)

        ### ORF type distributions
        title = "Predicted ORF type distributions"
        latex.section(out, title)

        # first, handle all of the regular datasets
        sample_names = sorted(config['riboseq_samples'].keys())

        # and check if we also have replicates
        replicate_names = []
        if 'riboseq_biological_replicates' in config:
            replicate_names = sorted(
                ribo_utils.get_riboseq_replicates(config).keys())

        strands = ["+", "-"]

        i = 0
        for sample_name in sample_names:
            try:
                lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                    config, sample_name, is_unique=is_unique)
            except FileNotFoundError:
                msg = ("Could not parse out lengths and offsets for sample: "
                    "{}. Skipping".format(sample_name))
                logger.error(msg)
                continue

            caption = "ORF types: {}".format(sample_name)
            is_first = True

            # first, just dump all of the bar charts to the page
            it = itertools.product(grouped_values, chisq_values, filtered_values)

            for is_grouped, is_chisq, is_filtered in it:
                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                orf_types_bar_chart = filenames.get_orf_types_bar_chart(
                    config['riboseq_data'], sample_name, length=lengths,
                    offset=offsets, is_unique=is_unique, note=out_note_str,
                    image_type=args.image_type, fraction=f,
                    reweighting_iterations=rw, is_grouped=is_grouped,
                    is_chisq=is_chisq, is_filtered=is_filtered)

                msg = "Looking for image file: {}".format(orf_types_bar_chart)
                logger.debug(msg)

                if os.path.exists(orf_types_bar_chart):
                    if is_first or (i % 4 == 0):
                        latex.begin_figure(out)
                        is_first = False

                    i += 1
                    latex.write_graphics(out, orf_types_bar_chart, height=0.15)

                    if i % 6 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)
                else:
                    msg = "Could not find image: {}".format(orf_types_bar_chart)
                    logger.warning(msg)

            if (i > 0) and (i % 6) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i % 6 != 0:
            latex.clearpage(out)

        # now, if the config file specifies replicates, create figures for those
        i = 0
        for replicate_name in replicate_names:
            lengths = None
            offsets = None

            caption = "ORF types: {}".format(replicate_name)
            it = itertools.product(grouped_values, chisq_values, filtered_values)
            is_first = True

            for is_grouped, is_chisq, is_filtered in it:
                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                orf_types_bar_chart = filenames.get_orf_types_bar_chart(
                    config['riboseq_data'], replicate_name, length=lengths,
                    offset=offsets, is_unique=is_unique, note=out_note_str,
                    image_type=args.image_type, fraction=f,
                    reweighting_iterations=rw, is_grouped=is_grouped,
                    is_chisq=is_chisq, is_filtered=is_filtered)

                msg = "Looking for image file: {}".format(orf_types_bar_chart)
                logger.debug(msg)

                if os.path.exists(orf_types_bar_chart):
                    if is_first or (i % 4 == 0):
                        latex.begin_figure(out)
                        is_first = False

                    i += 1
                    latex.write_graphics(out, orf_types_bar_chart, height=0.15)

                    if i % 6 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)
                else:
                    msg = "Could not find image: {}".format(orf_types_bar_chart)
                    logger.warning(msg)

            if (i > 0) and (i % 6) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i % 6 != 0:
            latex.clearpage(out)

        ### ORF type length distributions
        title = "Predicted ORF type length distributions"
        latex.section(out, title)

        i = 0
        for sample_name in sample_names:
            try:
                lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                    config, sample_name, is_unique=is_unique)
            except FileNotFoundError:
                msg = ("Could not parse out lengths and offsets for sample: "
                    "{}. Skipping".format(sample_name))
                logger.error(msg)
                continue

            caption = "ORF type length distributions: {}".format(sample_name)
            is_first = True
            it = itertools.product(grouped_values, chisq_values)

            for is_grouped, is_chisq in it:
                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                orf_length_line_graph = filenames.get_orf_length_distribution_line_graph(
                    config['riboseq_data'], sample_name, length=lengths,
                    offset=offsets, is_unique=is_unique, note=out_note_str,
                    image_type=args.image_type, fraction=f,
                    reweighting_iterations=rw, is_grouped=is_grouped,
                    is_chisq=is_chisq)

                if os.path.exists(orf_length_line_graph):
                    if is_first or (i % 4 == 0):
                        latex.begin_figure(out)
                        is_first = False

                    i += 1
                    latex.write_graphics(out, orf_length_line_graph, height=0.15)

                    if i % 4 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)
                else:
                    msg = "Could not find image: {}".format(orf_length_line_graph)
                    logger.debug(msg)

            if (i > 0) and (i % 4) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i % 4 != 0:
            latex.clearpage(out)

        # now, if the config file specifies replicates, create figures for those
        i = 0
        for replicate_name in replicate_names:
            lengths = None
            offsets = None

            caption = "ORF types: {}".format(replicate_name)
            is_first = True
            it = itertools.product(grouped_values, chisq_values)

            for is_grouped, is_chisq in it:
                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                orf_length_line_graph = filenames.get_orf_length_distribution_line_graph(
                    config['riboseq_data'], replicate_name, length=lengths,
                    offset=offsets, is_unique=is_unique, note=out_note_str,
                    image_type=args.image_type, fraction=f,
                    reweighting_iterations=rw, is_grouped=is_grouped,
                    is_chisq=is_chisq)

                if os.path.exists(orf_length_line_graph):
                    if is_first or (i % 4 == 0):
                        latex.begin_figure(out)
                        is_first = False

                    i += 1
                    latex.write_graphics(out, orf_length_line_graph, height=0.15)

                    if i % 4 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)
                else:
                    msg = "Could not find image: {}".format(orf_length_line_graph)
                    logger.debug(msg)

            if (i > 0) and (i % 4) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i % 4 != 0:
            latex.clearpage(out)

        ### ORF type metagene profiles
        if args.show_orf_periodicity:
            title = "Predicted ORF type metagene profiles"
            latex.section(out, title)

            i = 0
            for sample_name in sample_names:
                try:
                    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                        config, sample_name, is_unique=is_unique)
                except FileNotFoundError:
                    msg = ("Could not parse out lengths and offsets for "
                        "sample: {}. Skipping".format(sample_name))
                    logger.error(msg)
                    continue

                caption = "ORF type metagene profiles: {}".format(sample_name)
                is_first = True

                for is_chisq in chisq_values:
                    if is_chisq:
                        f = None
                        rw = None
                    else:
                        f = fraction
                        rw = reweighting_iterations

                    orf_type_profile_base = filenames.get_orf_type_profile_base(
                        config['riboseq_data'], sample_name, length=lengths,
                        offset=offsets, is_unique=is_unique, note=out_note_str,
                        fraction=f, reweighting_iterations=rw,
                        is_chisq=is_chisq)

                    it = itertools.product(ribo_utils.orf_types, strands)

                    for orf_type, strand in it:
                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base, orf_type, strand,
                            args.image_type)

                        msg = "Looking for image file: {}".format(orf_type_profile)
                        logger.debug(msg)

                        if os.path.exists(orf_type_profile):
                            if is_first or (i % 4 == 0):
                                latex.begin_figure(out)
                                is_first = False

                            i += 1
                            latex.write_graphics(out, orf_type_profile,
                                height=0.23)

                            if i % 4 == 0:
                                latex.write_caption(out, caption)
                                latex.end_figure(out)
                                latex.clearpage(out)
                        else:
                            msg = "Could not find image: {}".format(orf_type_profile)
                            logger.warning(msg)

                if (i > 0) and (i % 4 != 0):
                    latex.write_caption(out, caption)
                    latex.end_figure(out)
                    #latex.clearpage(out)

            if i % 4 != 0:
                latex.clearpage(out)

            i = 0
            for replicate_name in replicate_names:
                lengths = None
                offsets = None

                caption = "ORF type metagene profiles: {}".format(replicate_name)
                is_first = True

                for is_chisq in chisq_values:
                    if is_chisq:
                        f = None
                        rw = None
                    else:
                        f = fraction
                        rw = reweighting_iterations

                    orf_type_profile_base = filenames.get_orf_type_profile_base(
                        config['riboseq_data'], replicate_name, length=lengths,
                        offset=offsets, is_unique=is_unique, note=out_note_str,
                        fraction=f, reweighting_iterations=rw,
                        is_chisq=is_chisq)

                    it = itertools.product(ribo_utils.orf_types, strands)

                    for orf_type, strand in it:
                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base, orf_type, strand,
                            args.image_type)

                        if os.path.exists(orf_type_profile):
                            if is_first or (i % 4 == 0):
                                latex.begin_figure(out)
                                is_first = False

                            i += 1
                            latex.write_graphics(out, orf_type_profile,
                                height=0.23)

                            if i % 4 == 0:
                                latex.write_caption(out, caption)
                                latex.end_figure(out)
                                latex.clearpage(out)
                        else:
                            msg = "Could not find image: {}".format(orf_type_profile)
                            logger.debug(msg)

                if (i > 0) and (i % 4 != 0):
                    latex.write_caption(out, caption)
                    latex.end_figure(out)
                    #latex.clearpage(out)

            if i % 4 != 0:
                latex.clearpage(out)

        latex.end_document(out)

    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)
def main():
    global profiles_data, profiles_indices, profiles_indptr, profiles_shape
    global translated_models, untranslated_models
    global args

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""
            This script uses Hamiltonian MCMC with Stan to estimate translation
            parameters for a set of regions (presumably ORFs). Roughly, it takes
            as input:

            (1) a set of regions (ORFs) and their corresponding profiles
            (2) a "translated" model which gives the probability that a region
                is translated
            (3) an "untranslated" model which gives the probability that a
                region is not translated

            The script first smooths the profiles using LOWESS. It then
            calculates both the Bayes factor (using the smoothed profile) and
            the chi^2 value (using the raw counts) for each ORF.
        """)

    parser.add_argument('profiles', help="The ORF profiles (counts) (mtx)")
    parser.add_argument('regions', help="The regions (ORFs) for which "
        "predictions will be made (BED12+)")
    parser.add_argument('out', help="The output file for the Bayes' factors "
        "(BED12+)")

    parser.add_argument('--chi-square-only', help="If this flag is present, "
        "then only the chi square test will be performed for each ORF. This "
        "can also be a way to get the counts within each of the ORFs.",
        action='store_true')

    parser.add_argument('--translated-models', help="The models to use as H_t (pkl)",
        nargs='+')
    parser.add_argument('--untranslated-models', help="The models to use as H_u (pkl)",
        nargs='+')

    ### filtering options
    parser.add_argument('--orf-types', help="If values are given, then only "
        "orfs with those types are processed.", nargs='*',
        default=default_orf_types)
    parser.add_argument('--orf-type-field', default=default_orf_type_field)

    parser.add_argument('--min-length', help="ORFs with length less than this "
        "value will not be processed", type=int, default=default_min_length)
    parser.add_argument('--max-length', help="ORFs with length greater than "
        "this value will not be processed", type=int,
        default=default_max_length)
    parser.add_argument('--min-profile', help="ORFs with profile sum (i.e., "
        "number of reads) less than this value will not be processed.",
        type=float, default=default_min_profile)

    ### smoothing options
    parser.add_argument('--fraction', help="The fraction of signal to use in "
        "LOWESS", type=float, default=default_fraction)
    parser.add_argument('--reweighting-iterations', help="The number of "
        "reweighting iterations to use in LOWESS. Please see the statsmodels "
        "documentation for a detailed description of this parameter.",
        type=int, default=default_reweighting_iterations)

    ### MCMC options
    parser.add_argument('-s', '--seed', help="The random seeds to use for "
        "inference", type=int, default=default_seed)
    parser.add_argument('-c', '--chains', help="The number of MCMC chains to use",
        type=int, default=default_chains)
    parser.add_argument('-i', '--iterations', help="The number of MCMC "
        "iterations to use for each chain", type=int,
        default=default_iterations)

    ### behavior options
    parser.add_argument('--num-orfs', help="If n>0, then only this many ORFs "
        "will be processed", type=int, default=default_num_orfs)
    parser.add_argument('--orf-num-field', default=default_orf_num_field)
    parser.add_argument('--do-not-compress', help="Unless otherwise specified, "
        "the output will be written in GZip format", action='store_true')

    parser.add_argument('-g', '--num-groups', help="The number of groups into "
        "which to split the ORFs. More groups means the progress bar is "
        "updated more frequently but incurs more overhead because of the "
        "parallel calls.", type=int, default=default_num_groups)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # read in the regions and apply the filters
    msg = "Reading and filtering ORFs"
    logger.info(msg)
    regions = bed_utils.read_bed(args.regions)

    # by default, keep everything
    m_filters = np.array([True] * len(regions))

    if len(args.orf_types) > 0:
        m_orf_type = regions[args.orf_type_field].isin(args.orf_types)
        m_filters = m_orf_type & m_filters

    # min length
    if args.min_length > 0:
        m_min_length = regions['orf_len'] >= args.min_length
        m_filters = m_min_length & m_filters

    # max length
    if args.max_length > 0:
        m_max_length = regions['orf_len'] <= args.max_length
        m_filters = m_max_length & m_filters

    # min profile
    profiles = scipy.io.mmread(args.profiles).tocsr()
    profiles_sums = profiles.sum(axis=1)
    good_orf_nums = np.where(profiles_sums >= args.min_profile)
    good_orf_nums = set(good_orf_nums[0])
    m_profile = regions['orf_num'].isin(good_orf_nums)
    m_filters = m_profile & m_filters

    regions = regions[m_filters]

    if args.num_orfs > 0:
        regions = regions.head(args.num_orfs)

    regions = regions.reset_index(drop=True)

    msg = "Number of regions after filtering: {}".format(len(regions))
    logger.info(msg)

    logger.debug("Reading models")
    translated_models = [pickle.load(open(tm, 'rb'))
        for tm in args.translated_models]
    untranslated_models = [pickle.load(open(bm, 'rb'))
        for bm in args.untranslated_models]

    # expose the CSR components of the profiles matrix as shared memory, so
    # the worker processes can read them without copying
    profiles_data = multiprocessing.RawArray(ctypes.c_double, profiles.data.flat)
    profiles_indices = multiprocessing.RawArray(ctypes.c_int, profiles.indices)
    profiles_indptr = multiprocessing.RawArray(ctypes.c_int, profiles.indptr)
    profiles_shape = multiprocessing.RawArray(ctypes.c_int, profiles.shape)

    bfs_l = parallel.apply_parallel_split(regions, args.num_cpus,
        get_all_bayes_factors_args, num_groups=args.num_groups,
        progress_bar=True)

    bfs = pd.concat(bfs_l)

    # write the results as a bed12+ file
    bed_utils.write_bed(bfs, args.out)
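
# The worker-side counterpart of the RawArray sharing above lives in
# get_all_bayes_factors_args, which is not shown in this excerpt. A minimal
# sketch of how a worker could rebuild the shared CSR matrix without copying
# is shown below; the parameter names mirror the globals used in this script,
# but the helper itself is illustrative.
import ctypes

import numpy as np
import scipy.sparse


def rebuild_shared_csr(data, indices, indptr, shape):
    """Reconstruct a scipy CSR matrix from multiprocessing.RawArray buffers."""
    data_np = np.frombuffer(data, dtype=ctypes.c_double)
    indices_np = np.frombuffer(indices, dtype=ctypes.c_int)
    indptr_np = np.frombuffer(indptr, dtype=ctypes.c_int)
    shape_np = np.frombuffer(shape, dtype=ctypes.c_int)

    # the (data, indices, indptr) constructor shares the underlying buffers
    return scipy.sparse.csr_matrix(
        (data_np, indices_np, indptr_np), shape=tuple(shape_np))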
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates a simple latex document containing "
            "the read filtering images, metagene profiles and analysis, and "
            "standard section text.")
    parser.add_argument('config', help="The (yaml) config file for the project")
    parser.add_argument('out', help="The path for the output files")

    parser.add_argument('--show-orf-periodicity', help="If this flag is "
        "present, bar charts showing the periodicity of each ORF type will be "
        "included in the report.", action='store_true')

    parser.add_argument('--show-read-length-bfs', help="If this flag is given, "
        "plots showing the Bayes factor at each offset for each read length "
        "are included in the report.", action='store_true')

    parser.add_argument('--overwrite', help="If this flag is present, existing "
        "files will be overwritten.", action='store_true')

    parser.add_argument('--min-visualization-count', help="Read lengths with "
        "fewer than this number of reads will not be included in the report.",
        type=int, default=default_min_visualization_count)

    parser.add_argument('--image-type', help="The type of image file to "
        "create. This must be an extension which matplotlib can interpret.",
        default=default_image_type)

    parser.add_argument('-c', '--create-fastqc-reports', help="If this flag is "
        "given, then fastqc reports will be created for most fastq and bam "
        "files. By default, they are not created.", action='store_true')

    parser.add_argument('--tmp', help="If the fastqc reports are created, they "
        "will use this location for temp files", default=default_tmp)

    parser.add_argument('--note', help="If this option is given, it will be "
        "used in the filenames.\n\nN.B. This REPLACES the note in the config "
        "file.", default=default_note)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config))

    if args.note is not default_note:
        config['note'] = args.note
    note = config.get('note', None)

    sample_names = sorted(config['riboseq_samples'].keys())

    # keep multimappers?
    is_unique = 'keep_riboseq_multimappers' not in config

    programs = [
        'create-read-length-metagene-profile-plot',
        'visualize-metagene-profile-bayes-factor',
        'get-all-read-filtering-counts',
        'samtools',
        'visualize-read-filtering-counts',
        'get-read-length-distribution',
        'plot-read-length-distribution'
    ]

    if args.create_fastqc_reports:
        programs.extend(['fastqc', 'java'])

    shell_utils.check_programs_exist(programs)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)

    # first, create the read filtering information...
    create_read_filtering_plots(args.config, config, args)

    # ... and all the other figures.
    for name in sample_names:
        periodic_offsets = filenames.get_periodic_offsets(
            config['riboseq_data'], name, is_unique=is_unique, note=note)
        offsets_df = pd.read_csv(periodic_offsets)
        create_figures(args.config, config, name, offsets_df, args)

    min_metagene_profile_count = config.get(
        "min_metagene_profile_count", default_min_metagene_profile_count)
    min_metagene_profile_bayes_factor_mean = config.get(
        "min_metagene_profile_bayes_factor_mean",
        default_min_metagene_profile_bayes_factor_mean)
    max_metagene_profile_bayes_factor_var = config.get(
        "max_metagene_profile_bayes_factor_var",
        default_max_metagene_profile_bayes_factor_var)

    project_name = config.get("project_name", default_project_name)
    title = "Preprocessing results for {}".format(project_name)

    tex_file = os.path.join(args.out, "preprocessing-report.tex")

    with open(tex_file, 'w') as out:
        latex.begin_document(out, title, abstract, commands=commands)

        latex.section(out, "Introduction")
        latex.clearpage(out)
        latex.newpage(out)

        latex.section(out, "Mapping and filtering")
        latex.write(out, mapping_and_filtering_text)

        # the read filtering figures
        read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=note, image_type=args.image_type)

        n = "no-rrna-{}".format(note)
        no_rrna_read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=n, image_type=args.image_type)

        latex.begin_figure(out)
        latex.write_graphics(out, read_filtering_image, width=0.45)
        latex.write_graphics(out, no_rrna_read_filtering_image, width=0.45)
        latex.write_caption(out, read_filtering_caption,
            label=read_filtering_label)
        latex.end_figure(out)
        latex.clearpage(out)

        # the read length distributions
        latex.section(out, "Read length distributions",
            label=length_distribution_section_label)

        msg = "Writing length distribution figures"
        logger.info(msg)

        latex.begin_table(out, "cc")
        latex.write_header(out, ["All aligned reads", "Uniquely-aligning reads"])

        for name in sample_names:
            data = config['riboseq_samples'][name]

            read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'], name, is_unique=False, note=note,
                image_type=args.image_type)

            unique_read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'], name, is_unique=True, note=note,
                image_type=args.image_type)

            msg = "Looking for image file: {}".format(read_length_distribution_image)
            logger.debug(msg)

            if os.path.exists(read_length_distribution_image):
                latex.write_graphics(out, read_length_distribution_image,
                    width=0.45)
            else:
                msg = "Could not find image: {}".format(read_length_distribution_image)
                logger.warning(msg)

                text = "Missing: {}\n\n".format(name)
                latex.write(out, text)

            latex.write_column_sep(out)

            msg = "Looking for image file: {}".format(unique_read_length_distribution_image)
            logger.debug(msg)

            if os.path.exists(unique_read_length_distribution_image):
                latex.write_graphics(out, unique_read_length_distribution_image,
                    width=0.45)
            else:
                msg = "Could not find image: {}".format(unique_read_length_distribution_image)
                logger.warning(msg)

                text = "Missing: {}\n\n".format(name)
                latex.write(out, text)

            latex.write_row_sep(out)

        latex.end_table(out)
        latex.clearpage(out)

        latex.section(out, "Read length periodicity", label=periodicity_label)

        for name in sample_names:
            i = 0
            data = config['riboseq_samples'][name]

            msg = "Processing sample: {}".format(name)
            logger.info(msg)

            logger.debug("overwrite: {}".format(args.overwrite))

            periodic_offsets = filenames.get_periodic_offsets(
                config['riboseq_data'], name, is_unique=is_unique, note=note)
            offsets_df = pd.read_csv(periodic_offsets)

            min_read_length = int(offsets_df['length'].min())
            max_read_length = int(offsets_df['length'].max())

            latex.begin_table(out, "YY")

            header = "\\multicolumn{2}{c}{" + name + "}"
            header = [header]
            latex.write_header(out, header)

            for length in range(min_read_length, max_read_length + 1):
                msg = "Processing length: {}".format(length)
                logger.info(msg)

                # check which offset is used

                # select the row for this length
                mask_length = offsets_df['length'] == length

                # TODO: this is sometimes length 0. why?
                if sum(mask_length) == 0:
                    continue

                length_row = offsets_df[mask_length].iloc[0]

                # now, check all of the filters
                offset = int(length_row['highest_peak_offset'])
                offset_status = "Used for analysis"

                if length_row['highest_peak_bf_mean'] < min_metagene_profile_bayes_factor_mean:
                    offset_status = "BF mean too small"

                if length_row['highest_peak_bf_var'] > max_metagene_profile_bayes_factor_var:
                    offset_status = "BF variance too high"

                if length_row['highest_peak_profile_sum'] < min_metagene_profile_count:
                    offset_status = "Count too small"

                if length_row['highest_peak_profile_sum'] < args.min_visualization_count:
                    msg = "Not enough reads of this length. Skipping."
                    logger.warning(msg)
                    continue

                metagene_profile_image = filenames.get_metagene_profile_image(
                    config['riboseq_data'], name, image_type=args.image_type,
                    is_unique=is_unique, length=length, note=note)

                #title = ("length: {}. P-site offset: {}. \\newline status: {}"
                #    "\n".format(length, offset, offset_status))
                #latex.write(out, title, size="scriptsize")
                title = ("Length: {}. P-site offset: {}. Status: {}\n".format(
                    length, offset, offset_status))

                if args.show_read_length_bfs:
                    title = "\\scriptsize{" + title + "}"
                    title = "\\multicolumn{2}{c}{" + title + "}"
                    latex.write(out, title)
                    latex.write_row_sep(out)
                else:
                    latex.write(out, title, size="scriptsize")

                latex.write_graphics(out, metagene_profile_image, width=0.45)

                i += 1
                if i % 2 == 1:
                    latex.write_column_sep(out)
                else:
                    latex.write_row_sep(out)

                if args.show_read_length_bfs:
                    bayes_factor_image = filenames.get_metagene_profile_bayes_factor_image(
                        config['riboseq_data'], name,
                        image_type=args.image_type, is_unique=is_unique,
                        length=length, note=note)

                    #latex.centering(out)
                    latex.write_graphics(out, bayes_factor_image, width=0.45)

                    i += 1
                    if i % 2 == 1:
                        latex.write_column_sep(out)
                    else:
                        latex.write_row_sep(out)

            if i % 2 == 1:
                latex.write_row_sep(out)

            latex.end_table(out)
            latex.clearpage(out)

        ### ORF type metagene profiles
        if args.show_orf_periodicity:
            title = "ORF type periodicity"
            latex.section(out, title)

            strands = ['+', '-']
            for sample_name in sample_names:
                i = 0

                try:
                    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                        config, sample_name, is_unique=is_unique)
                except FileNotFoundError:
                    msg = ("Could not parse out lengths and offsets for "
                        "sample: {}. Skipping".format(sample_name))
                    logger.error(msg)
                    continue

                orf_type_profile_base = filenames.get_orf_type_profile_base(
                    config['riboseq_data'], sample_name, length=lengths,
                    offset=offsets, is_unique=is_unique, note=note,
                    subfolder='orf-profiles')

                for orf_type in ribo_utils.orf_types:
                    for strand in strands:
                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base, orf_type, strand,
                            image_type=args.image_type)

                        msg = "Looking for image file: {}".format(orf_type_profile)
                        logger.debug(msg)

                        if os.path.exists(orf_type_profile):
                            if i % 4 == 0:
                                latex.begin_figure(out)

                            i += 1
                            latex.write_graphics(out, orf_type_profile,
                                height=0.23)

                            if i % 4 == 0:
                                latex.end_figure(out)
                                latex.clearpage(out)

                if (i > 0) and (i % 4 != 0):
                    latex.end_figure(out)
                    latex.clearpage(out)

        latex.end_document(out)

    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)

    if args.create_fastqc_reports:
        parallel.apply_parallel_iter(config['riboseq_samples'].items(),
            args.num_cpus, create_fastqc_reports, config, args)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script converts bam files to bigWig files. It is "
            "mostly a port of this script: "
            "https://github.com/chapmanb/bcbb/blob/master/nextgen/scripts/bam_to_wiggle.py "
            "by Brad Chapman which avoids a few dependencies.\n\nThe wigToBigWig "
            "program (from UCSC tools) must be in the path.\n\nN.B. If given, "
            "the start and end coordinates must be base-0.")

    parser.add_argument('bam', help="The bam file", nargs='+')

    parser.add_argument('-o', '--overwrite', help="If this flag is given, then "
        "the bigWig file will be created whether it exists or not",
        action='store_true')

    parser.add_argument('-c', '--chrom', help="If specified, only alignments "
        "from this chromosome will be in the output", default=default_chrom)
    parser.add_argument('-s', '--start', help="If specified, only alignments "
        "from this position will be in the output", default=default_start)
    parser.add_argument('-e', '--end', help="If specified, only alignments "
        "up to this position will be in the output", default=default_end)

    parser.add_argument('-n', '--normalize', help="If this flag is given, "
        "then values will be normalized to reads per million",
        action='store_true')

    parser.add_argument('-t', '--use-tempfile', help="If this flag is given, "
        "then a temp file will be used to avoid permission issues",
        action='store_true')

    parser.add_argument('-k', '--keep-wig', help="If this flag is given, then "
        "the wiggle file will not be deleted", action='store_true')

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    programs = ['wigToBigWig']
    shell_utils.check_programs_exist(programs)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    parallel.apply_parallel_iter(args.bam, args.num_cpus, bam_to_wiggle, args,
        progress_bar=True)
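
# Illustrative invocation (the entry-point name and file names are
# hypothetical; bam_to_wiggle is the per-file worker, which is not shown in
# this excerpt):
#
#   bam-to-bigwig sample1.bam sample2.bam --normalize --keep-wig --num-cpus 2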
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script constructs the profile for each ORF. It "
            "first adjusts the mapped read positions to properly align with "
            "the P-sites. Second, it uses a custom chrom-sweep algorithm to "
            "find the coverage of each position in each exon of each ORF. "
            "Finally, the ORF exons are glued together to find the profile of "
            "the entire ORF.")

    parser.add_argument('bam', help="The bam file including filtered (unique, "
        "etc.) alignments")
    parser.add_argument('orfs', help="The (bed12) file containing the ORFs")
    parser.add_argument('exons', help="The (bed6+2) file containing the exons")
    parser.add_argument('out', help="The (mtx.gz) output file containing the "
        "ORF profiles")

    parser.add_argument('-l', '--lengths', help="If any values are given, "
        "then only reads which have those lengths will be included in the "
        "signal construction.", type=int, default=default_lengths, nargs='*')
    parser.add_argument('-o', '--offsets', help="The 5' end of reads will be "
        "shifted by this amount. There must be one offset value for each "
        "length (given by the --lengths argument).", type=int,
        default=default_offsets, nargs='*')

    parser.add_argument('-k', '--num-exons', help="If k>0, then only the "
        "first k exons will be processed.", type=int,
        default=default_num_exons)

    parser.add_argument('-g', '--num-groups', help="The number of groups into "
        "which to split the exons. More groups means the progress bar is "
        "updated more frequently but incurs more overhead because of the "
        "parallel calls.", type=int, default=default_num_groups)

    parser.add_argument('--seqname-prefix', help="If present, this string "
        "will be prepended to the seqname field of the ORFs.",
        default=default_seqname_prefix)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[extract-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    # make sure the number of lengths and offsets match
    if len(args.lengths) != len(args.offsets):
        msg = "The number of --lengths and --offsets do not match."
        raise ValueError(msg)

    # make sure the necessary files exist
    required_files = [args.bam, args.orfs, args.exons]
    msg = "[extract-orf-profiles]: Some input files were missing: "
    utils.check_files_exist(required_files, msg=msg)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    msg = "Finding P-sites"
    logger.info(msg)
    p_sites = ribo_utils.get_p_sites(args.bam, args.lengths, args.offsets)

    msg = "Reading exons"
    logger.info(msg)
    exons = bed_utils.read_bed(args.exons)

    msg = "Reading ORFs"
    logger.info(msg)
    orfs = bed_utils.read_bed(args.orfs)

    if len(args.seqname_prefix) > 0:
        orfs['seqname'] = args.seqname_prefix + orfs['seqname']
        exons['seqname'] = args.seqname_prefix + exons['seqname']

    if args.num_exons > 0:
        exons = exons.head(args.num_exons)

    num_orfs = orfs['orf_num'].max() + 1
    max_orf_len = orfs['orf_len'].max()

    msg = "Adding the ORF index to the exons"
    logger.info(msg)
    orf_fields = ['id', 'orf_num']
    exons_orfs = exons.merge(orfs[orf_fields], on='id')

    msg = "Splitting exons and P-sites"
    logger.info(msg)
    exon_groups = pandas_utils.split_df(exons_orfs, args.num_groups)

    exons_dfs = []
    psites_dfs = []

    for group_index, exon_group in exon_groups:
        # pull out only the p-sites that come from these chromosomes
        seqnames = set(exon_group['seqname'].unique())
        m_psites = p_sites['seqname'].isin(seqnames)

        exons_dfs.append(exon_group)
        psites_dfs.append(p_sites[m_psites])

    # we no longer need the full data frames, so save some memory
    del p_sites
    del exons_orfs
    del exon_groups
    del exons
    gc.collect()

    exons_psites = zip(exons_dfs, psites_dfs)

    msg = "Finding all P-site intersections"
    logger.info(msg)
    sum_profiles = parallel.apply_parallel_iter(exons_psites, args.num_cpus,
        get_all_p_site_intersections, num_orfs, max_orf_len,
        progress_bar=True, total=args.num_groups)

    msg = "Combining the ORF profiles into one matrix"
    logger.info(msg)
    f = lambda x, y: x + y
    sum_profiles = functools.reduce(f, sum_profiles)
    sum_profiles_lil = sum_profiles.tolil()

    msg = "Flipping the reverse strand profiles"
    logger.info(msg)
    m_reverse = orfs['strand'] == '-'
    reverse_orfs = orfs[m_reverse]

    for idx, reverse_orf in tqdm.tqdm(reverse_orfs.iterrows()):
        orf_num = reverse_orf['orf_num']

        if sum_profiles[orf_num].sum() == 0:
            continue

        orf_len = reverse_orf['orf_len']
        dense = utils.to_dense(sum_profiles, orf_num, length=orf_len)
        dense = dense[::-1]
        sum_profiles_lil[orf_num, :orf_len] = dense

    msg = "Writing the sparse matrix to disk"
    logger.info(msg)
    math_utils.write_sparse_matrix(args.out, sum_profiles_lil)
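
# The P-site adjustment the description refers to happens inside
# ribo_utils.get_p_sites, which is not shown here. A minimal sketch of the
# underlying idea, assuming a simple data frame of aligned reads, follows;
# the column names are hypothetical.
import pandas as pd


def shift_to_p_sites_sketch(reads, lengths, offsets):
    """Shift the 5' end of each read by the offset for its length.

    reads: data frame with 'start', 'end', 'strand', and 'length' columns
    lengths, offsets: parallel lists, as in the --lengths/--offsets arguments
    """
    shifted = []
    for length, offset in zip(lengths, offsets):
        m_length = reads['length'] == length
        subset = reads[m_length].copy()

        # on the forward strand, the P-site is downstream of the 5' end;
        # on the reverse strand, the 5' end is the 'end' coordinate
        m_forward = subset['strand'] == '+'
        subset.loc[m_forward, 'start'] += offset
        subset.loc[~m_forward, 'start'] = subset.loc[~m_forward, 'end'] - offset

        shifted.append(subset)

    return pd.concat(shifted)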
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script runs bowtie2 on all of the provided input "
            "files using the given index. By default, it does not save the "
            "alignments, aligned reads or unaligned reads. The respective "
            "flags must be given to retain the desired entities.")

    parser.add_argument('index', help="The bowtie2 index")
    parser.add_argument('out', help="The output directory")
    parser.add_argument('fastq', help="The fastq files", nargs='+')

    parser.add_argument('-a', '--alignments', help="If this flag is present, "
        "the alignments will be present in the output folder",
        action='store_true')
    parser.add_argument('--un-gz', help="If this flag is present, then the "
        "unaligned reads will be present in the output folder",
        action='store_true')
    parser.add_argument('--al-gz', help="If this flag is present, then the "
        "aligned reads will be present in the output folder",
        action='store_true')

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    programs = ['bowtie2', 'call-program']
    shell_utils.check_programs_exist(programs)

    if not os.path.exists(args.out):
        if not args.do_not_call:
            msg = "Creating output directory: {}".format(args.out)
            logger.info(msg)
            os.makedirs(args.out)

    for fastq in args.fastq:
        basename = utils.get_basename(fastq)
        out_files = []

        # we do not care about the alignments
        out = utils.abspath("dev", "null")
        out_str = "-S {}".format(out)

        if args.alignments:
            n = "{}.bam".format(basename)
            out = os.path.join(args.out, n)
            out_str = "-S {}".format(out)
            out_files.append(out)

        un_gz_str = ""
        if args.un_gz:
            n = "{}.un-al.fastq.gz".format(basename)
            n = os.path.join(args.out, n)
            un_gz_str = "--un-gz {}".format(n)
            out_files.append(n)

        al_gz_str = ""
        if args.al_gz:
            n = "{}.al.fastq.gz".format(basename)
            n = os.path.join(args.out, n)
            al_gz_str = "--al-gz {}".format(n)
            out_files.append(n)

        cmd = "call-program bowtie2 -p {} --very-fast -x {} -U {} {} {} {}".format(
            args.num_cpus, args.index, fastq, out_str, un_gz_str, al_gz_str)
        slurm.check_sbatch(cmd, args=args)
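
# For a hypothetical input sample.fastq.gz run with --un-gz and 8 cpus, the
# command constructed above would look roughly like:
#
#   call-program bowtie2 -p 8 --very-fast -x <index> -U sample.fastq.gz \
#       -S /dev/null --un-gz <out>/sample.un-al.fastq.gz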
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates all of the files necessary for downstream "
            "analysis performed with the rpbp package.")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('--overwrite', help="If this flag is present, existing files "
        "will be overwritten.", action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)

    config = yaml.load(open(args.config))
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'extract-orf-coordinates',
        'label-orfs',
        'bowtie2-build-s',
        'split-bed12-blocks',
        'gtf-to-bed12',
        args.star_executable
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'genome_base_path',
        'genome_name',
        'gtf',
        'fasta',
        'ribosomal_fasta',
        'ribosomal_index',
        'star_index'
    ]
    utils.check_keys_exist(config, required_keys)

    # check that the required files are present
    files = [config['gtf'], config['fasta'], config['ribosomal_fasta']]
    if 'de_novo_gtf' in config:
        files += [config['de_novo_gtf']]
    utils.check_files_exist(files, source='prepare-rpbp-genome')

    # now, check if we want to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # the rRNA index
    cmd = "bowtie2-build-s {} {}".format(
        config['ribosomal_fasta'], config['ribosomal_index'])
    in_files = [config['ribosomal_fasta']]
    out_files = bio.get_bowtie2_index_files(config['ribosomal_index'])
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
        overwrite=args.overwrite, call=call)

    # the STAR index
    mem = utils.human2bytes(args.mem)
    cmd = ("{} --runMode genomeGenerate --genomeDir {} --genomeFastaFiles {} "
        "--runThreadN {} --limitGenomeGenerateRAM {}".format(
            args.star_executable, config['star_index'], config['fasta'],
            args.num_cpus, mem))
    in_files = [config['fasta']]
    out_files = star_utils.get_star_index_files(config['star_index'])
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
        overwrite=args.overwrite, call=call)

    # get the main orfs
    get_orfs(config['gtf'], args, config, is_annotated=True, is_de_novo=False)

    # eventually, we will use these names
    annotated_orfs = filenames.get_orfs(
        config['genome_base_path'], config['genome_name'],
        note=config.get('orf_note'), is_annotated=True, is_de_novo=False)
    annotated_exons_file = filenames.get_exons(
        config['genome_base_path'], config['genome_name'],
        note=config.get('orf_note'), is_annotated=True, is_de_novo=False)
    orfs_genomic = filenames.get_orfs(
        config['genome_base_path'], config['genome_name'],
        note=config.get('orf_note'))
    exons_file = filenames.get_exons(
        config['genome_base_path'], config['genome_name'],
        note=config.get('orf_note'))

    # now, check if we have a de novo assembly
    if 'de_novo_gtf' in config:
        get_orfs(config['de_novo_gtf'], args, config,
            is_annotated=False, is_de_novo=True)

        # we need to concatenate the ORF and exon files
        de_novo_orfs = filenames.get_orfs(
            config['genome_base_path'], config['genome_name'],
            note=config.get('orf_note'), is_annotated=False, is_de_novo=True)
        de_novo_exons_file = filenames.get_exons(
            config['genome_base_path'], config['genome_name'],
            note=config.get('orf_note'), is_annotated=False, is_de_novo=True)

        orfs_files = [annotated_orfs, de_novo_orfs]
        orfs_files_str = ' '.join(orfs_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            orfs_genomic, orfs_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(orfs_files, sort_bed=True)
            concatenated_bed['orf_num'] = range(len(concatenated_bed))
            fields = bed_utils.bed12_field_names + ['orf_num', 'orf_len', 'orf_type']
            bed_utils.write_bed(concatenated_bed[fields], orfs_genomic)
        else:
            msg = "Skipping concatenation because the --do-not-call flag was given"
            logger.info(msg)

        exons_files = [annotated_exons_file, de_novo_exons_file]
        exons_files_str = ' '.join(exons_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            exons_file, exons_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(exons_files, sort_bed=True)
            fields = bed_utils.bed6_field_names + ['exon_index', 'transcript_start']
            bed_utils.write_bed(concatenated_bed[fields], exons_file)
        else:
            msg = "Skipping concatenation because the --do-not-call flag was given"
            logger.info(msg)

    else:
        # finally, make sure our files are named correctly
        if os.path.exists(annotated_orfs):
            utils.create_symlink(annotated_orfs, orfs_genomic, call)
        if os.path.exists(annotated_exons_file):
            utils.create_symlink(annotated_exons_file, exons_file, call)
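# A minimal, self-contained sketch (not part of the pipeline) of the
# concatenate-and-renumber step above: BED-like frames are stacked, and a
# fresh, contiguous 'orf_num' is assigned across the combined frame. Column
# names and values here are made up.
import pandas as pd

annotated = pd.DataFrame({'id': ['orf-1', 'orf-2'], 'orf_len': [300, 90]})
de_novo = pd.DataFrame({'id': ['orf-3'], 'orf_len': [150]})

combined = pd.concat([annotated, de_novo], ignore_index=True)
combined['orf_num'] = range(len(combined))  # 0, 1, 2
print(combined)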
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script identifies the ORF peptide matches for all samples in "
            "a project.")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('--peptide-filter-field', help="The field to use for "
        "filtering the peptides from MaxQuant", default=default_peptide_filter_field)
    parser.add_argument('--peptide-filter-value', help="All peptides with a value "
        "greater than the filter value will be removed", type=float,
        default=default_peptide_filter_value)
    parser.add_argument('--peptide-separator', help="The separator in the "
        "peptide file", default=default_peptide_separator)
    parser.add_argument('--note', help="If this option is given, it will be used in "
        "the output filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=default_note)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)

    config = yaml.load(open(args.config))
    call = not args.do_not_call

    programs = ['get-orf-peptide-matches']
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'peptide_files',
        'peptide_cell_type_analysis',
        'riboseq_data',
        'riboseq_samples'
    ]
    utils.check_keys_exist(config, required_keys)

    note_str = config.get('note', None)
    out_note_str = note_str
    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note

    args_dict = vars(args)

    # these keys must match the argparse "dest" names above
    peptide_filter_field_str = utils.get_config_argument(
        args_dict, 'peptide_filter_field')
    peptide_filter_value_str = utils.get_config_argument(
        args_dict, 'peptide_filter_value')
    peptide_separator_str = utils.get_config_argument(
        args_dict, 'peptide_separator')
    num_cpus_str = utils.get_config_argument(args_dict, 'num_cpus')

    cell_types = ribo_utils.get_riboseq_cell_type_samples(config)
    for cell_type, peptide_files in config['peptide_cell_type_analysis'].items():
        if cell_type not in cell_types:
            msg = ("Could not find cell_type specification. Please check the config "
                "file: {}".format(cell_type))
            logger.warning(msg)
            continue

        cell_type_protein = ribo_filenames.get_riboseq_cell_type_protein(
            config['riboseq_data'], cell_type, is_filtered=True, note=note_str)

        if not os.path.exists(cell_type_protein):
            msg = ("Could not find cell_type protein fasta. Skipping: {}".format(
                cell_type_protein))
            logger.warning(msg)
            continue

        for peptide_file in peptide_files:
            if peptide_file not in config['peptide_files']:
                msg = ("Could not find peptide_file specification. Please check "
                    "the config file: {}".format(peptide_file))
                logger.warning(msg)
                continue

            peptide_txt_file = config['peptide_files'][peptide_file]
            if not os.path.exists(peptide_txt_file):
                msg = ("Could not find peptide.txt file. Skipping: {}".format(
                    peptide_txt_file))
                logger.warning(msg)
                continue

            peptide_matches = ribo_filenames.get_riboseq_peptide_matches(
                config['riboseq_data'], cell_type, peptide_file,
                is_filtered=True, note=out_note_str)

            cmd = "get-orf-peptide-matches {} {} {} {} {} {} {} {}".format(
                cell_type_protein, peptide_txt_file, peptide_matches,
                num_cpus_str, peptide_filter_field_str, peptide_filter_value_str,
                peptide_separator_str, logging_str)

            slurm.check_sbatch(cmd, args=args)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This is a helper script which submits a set of samples to SLURM. It "
            "can also be used to run a set of samples sequentially. Due to limitations on "
            "the config file specification, all of the samples must use the same reference "
            "indices (i.e., genome sequence, set of ORFs, etc.).")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('--tmp', help="The temp directory", default=default_tmp)
    parser.add_argument('--flexbar-options', help="A space-delimited list of options to "
        "pass to flexbar. Each option must be quoted separately, as in "
        "\"--flexbarOption value\". If specified, flexbar options will override "
        "the default settings.", nargs='*', type=str)
    parser.add_argument('--overwrite', help="If this flag is present, existing files "
        "will be overwritten.", action='store_true')
    parser.add_argument('--profiles-only', help="If this flag is present, then only "
        "the pre-processing part of the pipeline will be called, i.e. profiles "
        "will be created for each sample specified in the config file, but no "
        "predictions will be made.", action='store_true')
    parser.add_argument('--merge-replicates', help="If this flag is present, then "
        "the ORF profiles from the replicates will be merged before making the final "
        "predictions", action='store_true')
    parser.add_argument('--run-replicates', help="If this flag is given with the "
        "--merge-replicates flag, then both the replicates *and* the individual "
        "samples will be run. This flag has no effect if --merge-replicates is not "
        "given.", action='store_true')
    parser.add_argument('-k', '--keep-intermediate-files', help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.", action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    config = yaml.load(open(args.config))
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'create-base-genome-profile',
        'remove-multimapping-reads',
        'extract-metagene-profiles',
        'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets',
        'extract-orf-profiles',
        'estimate-orf-bayes-factors',
        'select-final-prediction-set',
        'create-orf-profiles',
        'predict-translated-orfs',
        'run-rpbp-pipeline'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'riboseq_samples',
        'ribosomal_index',
        'star_index',
        'genome_base_path',
        'genome_name',
        'fasta',
        'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)

    # handle do_not_call so that we _do_ call the pipeline script,
    # but that it does not run anything
    do_not_call_str = ""
    if not call:
        do_not_call_str = "--do-not-call"
    args.do_not_call = False

    overwrite_str = ""
    if args.overwrite:
        overwrite_str = "--overwrite"

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    # check if we only want to create the profiles; in this case,
    # we call run-rpbp-pipeline with the --profiles-only option
    profiles_only_str = ""
    if args.profiles_only:
        args.merge_replicates = False
        profiles_only_str = "--profiles-only"
        msg = ("The --profiles-only option was given; it overrides --merge-replicates "
            "and/or --run-replicates, if those options were also given.")
        logger.info(msg)

    # if we merge the replicates, then we only use the rpbp script to create
    # the ORF profiles, but we still make predictions
    if args.merge_replicates and not args.run_replicates:
        profiles_only_str = "--profiles-only"

    if args.run_replicates and not args.merge_replicates:
        msg = ("The --run-replicates option was given without the --merge-replicates "
            "option. It will be ignored.")
        logger.warning(msg)

    flexbar_option_str = ""
    if args.flexbar_options is not None:
        flexbar_option_str = "--flexbar-options {}".format(
            ' '.join('"' + flx_op + '"' for flx_op in args.flexbar_options))

    # collect the job_ids in case we are using slurm and need to merge replicates
    job_ids = []

    sample_names = sorted(config['riboseq_samples'].keys())

    for sample_name in sample_names:
        data = config['riboseq_samples'][sample_name]

        tmp_str = ""
        if args.tmp is not None:
            tmp = os.path.join(args.tmp, "{}_{}_rpbp".format(sample_name, note))
            tmp_str = "--tmp {}".format(tmp)

        cmd = "run-rpbp-pipeline {} {} {} --num-cpus {} {} {} {} {} {} {} {} {} {}".format(
            data, args.config, sample_name, args.num_cpus, tmp_str,
            do_not_call_str, overwrite_str, logging_str, star_str,
            profiles_only_str, flexbar_option_str, keep_intermediate_str, mem_str)

        job_id = slurm.check_sbatch(cmd, args=args)
        job_ids.append(job_id)

    # now, if we are running the "standard" pipeline, we are finished
    if not args.merge_replicates:
        return

    # otherwise, we need to merge the replicates for each condition
    riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
    merge_replicates_str = "--merge-replicates"

    for condition_name in sorted(riboseq_replicates.keys()):
        # then we predict the ORFs; the dependency list ensures this only runs
        # after all per-sample jobs finish (see the sketch after this function)
        cmd = "predict-translated-orfs {} {} --num-cpus {} {} {} {} {}".format(
            args.config, condition_name, args.num_cpus, do_not_call_str,
            overwrite_str, logging_str, merge_replicates_str)
        slurm.check_sbatch(cmd, args=args, dependencies=job_ids)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script downloads Sequence Read Archive runs (i.e., SRR files) "
            "over ftp. It only requires the run number. It also converts the files from "
            "the .sra format to .fastq.gz files. It then deletes the .sra file.")
    parser.add_argument('srr', help="A csv file containing the SRR accessions to "
        "download. Optionally, it can also include whether the samples are paired-"
        "end or not.")
    parser.add_argument('outdir', help="The location for the fastq.gz files")
    parser.add_argument('-a', '--accession-field', help="The name of the column "
        "containing the SRR identifiers", default=default_accession_field)
    parser.add_argument('-p', '--paired-field', help="The name of the column "
        "indicating whether the sample is paired-end", default=default_paired_field)
    parser.add_argument('-v', '--paired-values', help="The exact string values in "
        "the paired-field which indicate the sample is paired-end", nargs="*",
        default=default_paired_values)
    parser.add_argument('-s', '--source', help="The server from which the files "
        "will be downloaded", choices=source_choices, default=default_source)
    parser.add_argument('--overwrite', help="If this flag is given, then existing "
        "files will be re-downloaded. Otherwise, if either the .sra or .fastq.gz "
        "file already exists, then the sra file will not be downloaded.",
        action='store_true')
    parser.add_argument('--num-downloads-per-connection', help="The number of "
        "files to download with each open connection. Each connection will be "
        "closed and re-opened after this many files are downloaded.", type=int,
        default=default_num_downloads_per_connection)
    parser.add_argument('--sep', help="The separator in the SRR file",
        default=default_sep)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    programs = ['fastq-dump']
    shell_utils.check_programs_exist(programs)

    # check if we want to use slurm
    if args.use_slurm:
        msg = ("The --use-slurm option was given, so sbatch will now be used "
            "to submit to slurm.")
        logger.warning(msg)

        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)

        # and quit!
        return

    msg = "Reading SRR list"
    logger.info(msg)

    srr = pd.read_csv(args.srr, sep=args.sep)
    parallel.apply_parallel_split(srr, args.num_cpus, process_files, args)
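# An illustrative input file for this script (the column names are whatever
# --accession-field and --paired-field are set to; everything here is made up):
import pandas as pd
from io import StringIO

csv_text = "accession,is_paired\nSRR1234567,paired\nSRR7654321,single\n"
srr = pd.read_csv(StringIO(csv_text))

# with --accession-field accession --paired-field is_paired --paired-values paired,
# the first run is treated as paired-end and the second as single-end
m_paired = srr['is_paired'].isin(['paired'])
print(srr.loc[m_paired, 'accession'].tolist())  # ['SRR1234567']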
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates a simple latex document containing the read "
            "filtering images, metagene profiles and analysis, and standard section text.")
    parser.add_argument('config', help="The (yaml) config file for the project")
    parser.add_argument('out', help="The path for the output files")
    parser.add_argument('--show-orf-periodicity', help="If this flag is "
        "present, bar charts showing the periodicity of each ORF type will be "
        "included in the report.", action='store_true')
    parser.add_argument('--show-read-length-bfs', help="If this flag is given, "
        "plots showing the Bayes factor at each offset for each read length "
        "are included in the report.", action='store_true')
    parser.add_argument('--overwrite', help="If this flag is present, existing files will "
        "be overwritten.", action='store_true')
    parser.add_argument('--min-visualization-count', help="Read lengths with fewer than this "
        "number of reads will not be included in the report.", type=int,
        default=default_min_visualization_count)
    parser.add_argument('--image-type', help="The type of image to create. This "
        "must be an extension which matplotlib can interpret.",
        default=default_image_type)
    parser.add_argument('-c', '--create-fastqc-reports', help="If this flag is given, then "
        "fastqc reports will be created for most fastq and bam files. By default, they are "
        "not created.", action='store_true')
    parser.add_argument('--tmp', help="If the fastqc reports are created, "
        "they will use this location for temp files", default=default_tmp)
    parser.add_argument('--note', help="If this option is given, it will be used in the "
        "filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=default_note)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # the config file is loaded once; the note from the config file is
    # replaced if --note was given on the command line
    config = yaml.load(open(args.config))
    if args.note is not None:
        config['note'] = args.note
    note = config.get('note', None)

    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    programs = [
        'create-read-length-metagene-profile-plot',
        'visualize-metagene-profile-bayes-factor',
        'get-all-read-filtering-counts',
        'samtools',
        'visualize-read-filtering-counts',
        'get-read-length-distribution',
        'plot-read-length-distribution'
    ]
    if args.create_fastqc_reports:
        programs.extend(['fastqc', 'java'])
    shell_utils.check_programs_exist(programs)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)

    # first, create the read filtering information
    create_read_filtering_plots(args.config, config, args)

    min_metagene_profile_count = config.get(
        "min_metagene_profile_count", default_min_metagene_profile_count)
    min_metagene_profile_bayes_factor_mean = config.get(
        "min_metagene_profile_bayes_factor_mean",
        default_min_metagene_profile_bayes_factor_mean)
    max_metagene_profile_bayes_factor_var = config.get(
        "max_metagene_profile_bayes_factor_var",
        default_max_metagene_profile_bayes_factor_var)

    project_name = config.get("project_name", default_project_name)
    title = "Preprocessing results for {}".format(project_name)

    sample_names = sorted(config['riboseq_samples'].keys())

    tex_file = os.path.join(args.out, "preprocessing-report.tex")
    with open(tex_file, 'w') as out:

        latex.begin_document(out, title, abstract, commands=commands)
        latex.section(out, "Introduction")
        latex.clearpage(out)
        latex.newpage(out)

        latex.section(out, "Mapping and filtering")
        latex.write(out, mapping_and_filtering_text)

        # the read filtering figures
        read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=note, image_type=args.image_type)
        n = "no-rrna-{}".format(note)
        no_rrna_read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=n, image_type=args.image_type)

        latex.begin_figure(out)
        latex.write_graphics(out, read_filtering_image, height=0.45)
        latex.write_graphics(out, no_rrna_read_filtering_image, height=0.45)
        latex.write_caption(out, read_filtering_caption, label=read_filtering_label)
        latex.end_figure(out)
        latex.clearpage(out)

        # the read length distributions
        latex.section(out, "Read length distributions",
            label=length_distribution_section_label)

        msg = "Writing length distribution figures"
        logger.info(msg)

        latex.begin_table(out, "cc")
        latex.write_header(out, ["All aligned reads", "Uniquely-aligning reads"])

        for name in sample_names:
            data = config['riboseq_samples'][name]

            read_length_distribution_image = \
                filenames.get_riboseq_read_length_distribution_image(
                    config['riboseq_data'], name, is_unique=False, note=note,
                    image_type=args.image_type)
            unique_read_length_distribution_image = \
                filenames.get_riboseq_read_length_distribution_image(
                    config['riboseq_data'], name, is_unique=True, note=note,
                    image_type=args.image_type)

            msg = "Looking for image file: {}".format(read_length_distribution_image)
            logger.debug(msg)

            if os.path.exists(read_length_distribution_image):
                latex.write_graphics(out, read_length_distribution_image, width=0.45)
            else:
                msg = "Could not find image: {}".format(read_length_distribution_image)
                logger.warning(msg)
                text = "Missing: {}\n\n".format(name)
                latex.write(out, text)

            latex.write_column_sep(out)

            msg = "Looking for image file: {}".format(
                unique_read_length_distribution_image)
            logger.debug(msg)

            if os.path.exists(unique_read_length_distribution_image):
                latex.write_graphics(out, unique_read_length_distribution_image,
                    width=0.45)
            else:
                msg = "Could not find image: {}".format(
                    unique_read_length_distribution_image)
                logger.warning(msg)
                text = "Missing: {}\n\n".format(name)
                latex.write(out, text)

            latex.write_row_sep(out)

        latex.end_table(out)
        latex.clearpage(out)

        latex.section(out, "Read length periodicity", label=periodicity_label)

        for name in sample_names:
            i = 0
            data = config['riboseq_samples'][name]

            msg = "Processing sample: {}".format(name)
            logger.info(msg)
            logger.debug("overwrite: {}".format(args.overwrite))

            periodic_offsets = filenames.get_periodic_offsets(
                config['riboseq_data'], name, is_unique=is_unique, note=note)
            offsets_df = pd.read_csv(periodic_offsets)

            min_read_length = int(offsets_df['length'].min())
            max_read_length = int(offsets_df['length'].max())

            create_figures(args.config, config, name, offsets_df, args)

            latex.begin_table(out, "YY")

            header = "\\multicolumn{2}{c}{" + name + "}"
            header = [header]
            latex.write_header(out, header)

            for length in range(min_read_length, max_read_length + 1):
                msg = "Processing length: {}".format(length)
                logger.info(msg)

                # check which offset is used: select the row for this length
                mask_length = offsets_df['length'] == length

                # TODO: this is sometimes length 0. why?
                if sum(mask_length) == 0:
                    continue

                length_row = offsets_df[mask_length].iloc[0]

                # now, check all of the filters
                offset = int(length_row['highest_peak_offset'])
                offset_status = "Used for analysis"

                if length_row['highest_peak_bf_mean'] < min_metagene_profile_bayes_factor_mean:
                    offset_status = "BF mean too small"
                if length_row['highest_peak_bf_var'] > max_metagene_profile_bayes_factor_var:
                    offset_status = "BF variance too high"
                if length_row['highest_peak_profile_sum'] < min_metagene_profile_count:
                    offset_status = "Count too small"

                if length_row['highest_peak_profile_sum'] < args.min_visualization_count:
                    msg = "Not enough reads of this length. Skipping."
                    logger.warning(msg)
                    continue

                metagene_profile_image = filenames.get_metagene_profile_image(
                    config['riboseq_data'], name, image_type=args.image_type,
                    is_unique=is_unique, length=length, note=note)

                title = ("length: {}. P-site offset: {}. \\newline status: {}"
                    "\n".format(length, offset, offset_status))
                latex.write(out, title, size="scriptsize")
                latex.write_graphics(out, metagene_profile_image, width=0.45)

                # alternate column and row separators so images fill the table
                # two per row (a standalone sketch of this alternation appears
                # after this function)
                i += 1
                if i % 2 == 1:
                    latex.write_column_sep(out)
                else:
                    latex.write_row_sep(out)

                if args.show_read_length_bfs:
                    bayes_factor_image = filenames.get_metagene_profile_bayes_factor_image(
                        config['riboseq_data'], name, image_type=args.image_type,
                        is_unique=is_unique, length=length, note=note)

                    latex.centering(out)
                    latex.write_graphics(out, bayes_factor_image, width=0.45)

                    i += 1
                    if i % 2 == 1:
                        latex.write_column_sep(out)
                    else:
                        latex.write_row_sep(out)

            if i % 2 == 1:
                latex.write_row_sep(out)

            latex.end_table(out)
            latex.clearpage(out)

        # the ORF type metagene profiles
        if args.show_orf_periodicity:
            title = "ORF type periodicity"
            latex.section(out, title)

            strands = ['+', '-']
            for sample_name in sample_names:
                i = 0

                try:
                    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                        config, sample_name, is_unique=is_unique)
                except FileNotFoundError:
                    msg = ("Could not parse out lengths and offsets for sample: {}. "
                        "Skipping".format(sample_name))
                    logger.error(msg)
                    continue

                orf_type_profile_base = filenames.get_orf_type_profile_base(
                    config['riboseq_data'], sample_name, length=lengths,
                    offset=offsets, is_unique=is_unique, note=note,
                    subfolder='orf-profiles')

                for orf_type in ribo_utils.orf_types:
                    for strand in strands:
                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base, orf_type, strand,
                            image_type=args.image_type)

                        msg = "Looking for image file: {}".format(orf_type_profile)
                        logger.debug(msg)

                        if os.path.exists(orf_type_profile):
                            if i % 4 == 0:
                                latex.begin_figure(out)

                            i += 1
                            latex.write_graphics(out, orf_type_profile, height=0.23)

                            if i % 4 == 0:
                                latex.end_figure(out)
                                latex.clearpage(out)

                if (i > 0) and (i % 4 != 0):
                    latex.end_figure(out)
                    latex.clearpage(out)

        latex.end_document(out)

    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)

    if args.create_fastqc_reports:
        parallel.apply_parallel_iter(config['riboseq_samples'].items(),
            args.num_cpus, create_fastqc_reports, config, args)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script extracts all of the ORFs from the given transcripts. "
            "It writes the result as a bed12+1 file. The additional field, 'orf_len', gives "
            "the length of the respective ORF. It removes duplicate ORFs.\n\nN.B. The DEBUG "
            "output for this script is _very_ verbose. It is not recommended to run this "
            "script with that logging level.")
    parser.add_argument('transcripts_bed', help="The bed12 file containing the "
        "transcript information")
    parser.add_argument('transcripts_fasta', help="The fasta file containing the "
        "spliced transcript sequences")
    parser.add_argument('out', help="The output (bed12+1 gz) file")
    parser.add_argument('--start-codons', help="A list of codons which will be "
        "treated as start codons when extracting ORFs", nargs='+',
        default=default_start_codons)
    parser.add_argument('--stop-codons', help="A list of codons which will be "
        "treated as stop codons when extracting ORFs", nargs='+',
        default=default_stop_codons)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # check if we want to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    msg = "Compiling start and stop codon regular expressions"
    logger.info(msg)

    start_codons_re = '|'.join(args.start_codons)
    stop_codons_re = '|'.join(args.stop_codons)

    start_codons_re = re.compile(start_codons_re)
    stop_codons_re = re.compile(stop_codons_re)

    msg = "Reading transcripts bed file"
    logger.info(msg)
    transcripts_bed = bed_utils.read_bed(args.transcripts_bed)

    msg = "Creating the sequence iterator"
    logger.info(msg)

    transcripts_fasta = fastx_utils.get_read_iterator(args.transcripts_fasta)
    transcripts_iter = (
        (get_transcript(transcript_header, transcripts_bed), transcript_sequence)
        for (transcript_header, transcript_sequence) in transcripts_fasta)

    msg = "Finding all ORFs"
    logger.info(msg)

    orfs = parallel.apply_parallel_iter(transcripts_iter, args.num_cpus, get_orfs,
        start_codons_re, stop_codons_re, total=len(transcripts_bed),
        progress_bar=True)

    msg = "Joining ORFs in a large data frame"
    logger.info(msg)
    orfs = pd.concat(orfs)

    msg = "Removing duplicate ORFs"
    logger.info(msg)
    orfs = orfs.drop_duplicates(subset=DUPLICATE_FIELDS)

    msg = "Numbering remaining ORFs"
    logger.info(msg)
    orfs['orf_num'] = np.arange(len(orfs))

    msg = "Writing ORFs to disk"
    logger.info(msg)
    bed_utils.write_bed(orfs, args.out)
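# A minimal, standalone illustration (with made-up codon lists and sequence) of
# the alternation regexes compiled above: joining codons with '|' yields a
# pattern that matches any start or stop codon in a transcript sequence.
import re

start_codons_re = re.compile('|'.join(['ATG']))
stop_codons_re = re.compile('|'.join(['TAA', 'TAG', 'TGA']))

seq = "GGATGAAATGA"
print(start_codons_re.search(seq).start())  # 2: the first 'ATG'
print(stop_codons_re.search(seq).start())   # 3: a 'TGA' overlapping the start codon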
def main():
    global profiles_data, profiles_indices, profiles_indptr, profiles_shape
    global translated_models, untranslated_models
    global args

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""
            This script uses Hamiltonian MCMC with Stan to estimate translation parameters
            for a set of regions (presumably ORFs). Roughly, it takes as input:

            (1) a set of regions (ORFs) and their corresponding profiles
            (2) a "translated" model which gives the probability that a region is translated
            (3) an "untranslated" model which gives the probability that a region is not translated

            The script first smooths the profiles using LOWESS. It then calculates both the
            Bayes factor (using the smoothed profile) and the chi-square value (using the
            raw counts) for each ORF.
        """)

    parser.add_argument('profiles', help="The ORF profiles (counts) (mtx)")
    parser.add_argument('regions', help="The regions (ORFs) for which predictions will "
        "be made (BED12+)")
    parser.add_argument('out', help="The output file for the Bayes' factors (BED12+)")

    parser.add_argument('--chi-square-only', help="If this flag is present, then only the chi "
        "square test will be performed for each ORF. This can also be a way to get the counts "
        "within each of the ORFs.", action='store_true')
    parser.add_argument('--translated-models', help="The models to use as H_t (pkl)",
        nargs='+')
    parser.add_argument('--untranslated-models', help="The models to use as H_u (pkl)",
        nargs='+')

    ### filtering options
    parser.add_argument('--orf-types', help="If values are given, then only ORFs with "
        "those types are processed.", nargs='*', default=default_orf_types)
    parser.add_argument('--orf-type-field', default=default_orf_type_field)
    parser.add_argument('--min-length', help="ORFs with length less than this value will not "
        "be processed", type=int, default=default_min_length)
    parser.add_argument('--max-length', help="ORFs with length greater than this value will not "
        "be processed", type=int, default=default_max_length)
    parser.add_argument('--min-profile', help="ORFs with profile sum (i.e., number "
        "of reads) less than this value will not be processed.", type=float,
        default=default_min_profile)

    ### smoothing options
    parser.add_argument('--fraction', help="The fraction of signal to use in LOWESS",
        type=float, default=default_fraction)
    parser.add_argument('--reweighting-iterations', help="The number of reweighting "
        "iterations to use in LOWESS. Please see the statsmodels documentation for a "
        "detailed description of this parameter.", type=int,
        default=default_reweighting_iterations)

    ### MCMC options
    parser.add_argument('-s', '--seed', help="The random seed to use for inference",
        type=int, default=default_seed)
    parser.add_argument('-c', '--chains', help="The number of MCMC chains to use",
        type=int, default=default_chains)
    parser.add_argument('-i', '--iterations', help="The number of MCMC iterations to use for "
        "each chain", type=int, default=default_iterations)

    ### behavior options
    parser.add_argument('--num-orfs', help="If n>0, then only this many ORFs will be processed",
        type=int, default=default_num_orfs)
    parser.add_argument('--orf-num-field', default=default_orf_num_field)
    parser.add_argument('--do-not-compress', help="By default, the output is written in "
        "gzip format; this flag disables compression.", action='store_true')
    parser.add_argument('-g', '--num-groups', help="The number of groups into which to split "
        "the ORFs. More groups means the progress bar is updated more frequently but incurs "
        "more overhead because of the parallel calls.", type=int,
        default=default_num_groups)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # read in the regions and apply the filters
    msg = "Reading and filtering ORFs"
    logger.info(msg)
    regions = bed_utils.read_bed(args.regions)

    # by default, keep everything
    m_filters = np.array([True] * len(regions))

    if len(args.orf_types) > 0:
        m_orf_type = regions[args.orf_type_field].isin(args.orf_types)
        m_filters = m_orf_type & m_filters

    # min length
    if args.min_length > 0:
        m_min_length = regions['orf_len'] >= args.min_length
        m_filters = m_min_length & m_filters

    # max length
    if args.max_length > 0:
        m_max_length = regions['orf_len'] <= args.max_length
        m_filters = m_max_length & m_filters

    # min profile
    profiles = scipy.io.mmread(args.profiles).tocsr()
    profiles_sums = profiles.sum(axis=1)
    good_orf_nums = np.where(profiles_sums >= args.min_profile)
    good_orf_nums = set(good_orf_nums[0])
    m_profile = regions['orf_num'].isin(good_orf_nums)
    m_filters = m_profile & m_filters

    regions = regions[m_filters]

    if args.num_orfs > 0:
        regions = regions.head(args.num_orfs)

    regions = regions.reset_index(drop=True)

    msg = "Number of regions after filtering: {}".format(len(regions))
    logger.info(msg)

    logger.debug("Reading models")
    translated_models = [pickle.load(open(tm, 'rb')) for tm in args.translated_models]
    untranslated_models = [pickle.load(open(bm, 'rb')) for bm in args.untranslated_models]

    # share the profiles matrix across all processes as raw arrays
    # (a standalone sketch of this pattern follows this function)
    profiles_data = multiprocessing.RawArray(ctypes.c_double, profiles.data.flat)
    profiles_indices = multiprocessing.RawArray(ctypes.c_int, profiles.indices)
    profiles_indptr = multiprocessing.RawArray(ctypes.c_int, profiles.indptr)
    profiles_shape = multiprocessing.RawArray(ctypes.c_int, profiles.shape)

    with suppress_stdout_stderr():
        bfs_l = parallel.apply_parallel_split(
            regions, args.num_cpus, get_all_bayes_factors_args,
            num_groups=args.num_groups, progress_bar=True)

    bfs = pd.concat(bfs_l)

    # write the results as a bed12+ file
    bed_utils.write_bed(bfs, args.out)
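# A minimal, self-contained sketch of the shared-memory pattern above: a scipy
# CSR matrix is flattened into multiprocessing.RawArrays so worker processes
# can rebuild it without pickling the whole matrix. Variable names here are
# illustrative; the rebuild step is what a worker would do with the globals.
import ctypes
import multiprocessing

import numpy as np
import scipy.sparse

m = scipy.sparse.csr_matrix(np.eye(3))

data = multiprocessing.RawArray(ctypes.c_double, m.data)
indices = multiprocessing.RawArray(ctypes.c_int, m.indices)
indptr = multiprocessing.RawArray(ctypes.c_int, m.indptr)
shape = multiprocessing.RawArray(ctypes.c_int, m.shape)

# inside a worker, the matrix is reconstructed as a view of the shared buffers
rebuilt = scipy.sparse.csr_matrix(
    (np.frombuffer(data), np.frombuffer(indices, dtype=np.int32),
        np.frombuffer(indptr, dtype=np.int32)),
    shape=tuple(shape))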
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script runs the Rp-Bp and Rp-chi pipelines on a given sample. "
            "It requires a YAML config file that includes a number of keys. Please see the "
            "documentation for a complete description.")
    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name', help="The name for the dataset, used in the created files")
    parser.add_argument('--tmp', help="The temp directory", default=default_tmp)
    parser.add_argument('--flexbar-format-option', help="The name of the \"format\" "
        "option for flexbar. This changed from \"format\" to \"qtrim-format\" in "
        "version 2.7.", default=default_flexbar_format_option)
    parser.add_argument('--overwrite', help="If this flag is present, existing files "
        "will be overwritten.", action='store_true')
    parser.add_argument('--profiles-only', help="If this flag is present, then only "
        "the ORF profiles will be created", action='store_true')
    parser.add_argument('-k', '--keep-intermediate-files', help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.", action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    config = yaml.load(open(args.config))
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'create-base-genome-profile',
        'remove-multimapping-reads',
        'extract-metagene-profiles',
        'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets',
        'extract-orf-profiles',
        'estimate-orf-bayes-factors',
        'select-final-prediction-set',
        'create-orf-profiles',
        'predict-translated-orfs'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'ribosomal_index',
        'star_index',
        'genome_base_path',
        'genome_name',
        'fasta',
        'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    # now, check if we want to use slurm
    msg = "use_slurm: {}".format(args.use_slurm)
    logger.debug(msg)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    note_str = config.get('note', None)

    # the first step is the standard riboseq preprocessing

    # handle do_not_call so that we _do_ call the preprocessing script,
    # but that it does not run anything
    do_not_call_str = ""
    if not call:
        do_not_call_str = "--do-not-call"

    overwrite_str = ""
    if args.overwrite:
        overwrite_str = "--overwrite"

    # for a sample, we first create its filtered genome profile
    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    flexbar_format_option_str = ""
    if args.flexbar_format_option is not None:
        flexbar_format_option_str = "--flexbar-format-option {}".format(
            args.flexbar_format_option)

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    cmd = ("create-orf-profiles {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}".format(
        args.raw_data, args.config, args.name, args.num_cpus, mem_str,
        do_not_call_str, overwrite_str, logging_str, star_str, tmp_str,
        flexbar_format_option_str, keep_intermediate_str))
    shell_utils.check_call(cmd)

    # check if we only want to create the profiles
    if args.profiles_only:
        return

    # then we predict the ORFs
    cmd = ("predict-translated-orfs {} {} --num-cpus {} {} {} {}".format(
        args.config, args.name, args.num_cpus, do_not_call_str,
        overwrite_str, logging_str))
    shell_utils.check_call(cmd)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script runs the Rp-Bp and Rp-chi pipelines on a given sample. "
            "It requires a YAML config file that includes a number of keys. Please see the "
            "documentation for a complete description.")
    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name', help="The name for the dataset, used in the created files")
    parser.add_argument('--tmp', help="The temp directory", default=default_tmp)
    parser.add_argument('--flexbar-options', help="A space-delimited list of options to "
        "pass to flexbar. Each option must be quoted separately, as in "
        "\"--flexbarOption value\". If specified, flexbar options will override "
        "the default settings.", nargs='*', type=str)
    parser.add_argument('--overwrite', help="If this flag is present, existing files "
        "will be overwritten.", action='store_true')
    parser.add_argument('--profiles-only', help="If this flag is present, then only "
        "the ORF profiles will be created", action='store_true')
    parser.add_argument('-k', '--keep-intermediate-files', help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.", action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    config = yaml.load(open(args.config))
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'create-base-genome-profile',
        'remove-multimapping-reads',
        'extract-metagene-profiles',
        'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets',
        'extract-orf-profiles',
        'estimate-orf-bayes-factors',
        'select-final-prediction-set',
        'create-orf-profiles',
        'predict-translated-orfs'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'ribosomal_index',
        'star_index',
        'genome_base_path',
        'genome_name',
        'fasta',
        'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    # now, check if we want to use slurm
    msg = "use_slurm: {}".format(args.use_slurm)
    logger.debug(msg)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    note_str = config.get('note', None)

    # the first step is the standard riboseq preprocessing

    # handle do_not_call so that we _do_ call the preprocessing script,
    # but that it does not run anything
    do_not_call_str = ""
    if not call:
        do_not_call_str = "--do-not-call"

    overwrite_str = ""
    if args.overwrite:
        overwrite_str = "--overwrite"

    # for a sample, we first create its filtered genome profile
    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    flexbar_option_str = ""
    if args.flexbar_options is not None:
        flexbar_option_str = "--flexbar-options {}".format(
            ' '.join('"' + flx_op + '"' for flx_op in args.flexbar_options))

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    cmd = ("create-orf-profiles {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}".format(
        args.raw_data, args.config, args.name, args.num_cpus, mem_str,
        do_not_call_str, overwrite_str, logging_str, star_str, tmp_str,
        flexbar_option_str, keep_intermediate_str))
    shell_utils.check_call(cmd)

    # check if we only want to create the profiles
    if args.profiles_only:
        return

    # then we predict the ORFs
    cmd = ("predict-translated-orfs {} {} --num-cpus {} {} {} {}".format(
        args.config, args.name, args.num_cpus, do_not_call_str,
        overwrite_str, logging_str))
    shell_utils.check_call(cmd)
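# The pipeline above reduces to two sequential, blocking subprocess calls; this
# stripped-down equivalent uses only the standard library (file names and the
# sample name are illustrative, and most optional flags are omitted).
import shlex
import subprocess

for cmd in ("create-orf-profiles raw.fastq.gz config.yaml sample1",
            "predict-translated-orfs config.yaml sample1"):
    subprocess.run(shlex.split(cmd), check=True)  # raises if a step fails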
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates all of the files necessary for downstream "
            "analysis performed with the rpbp package.")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('--overwrite', help="If this flag is present, existing files "
        "will be overwritten.", action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)

    config = yaml.load(open(args.config))
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'extract-orf-coordinates',
        'label-orfs',
        'bowtie2-build-s',
        'split-bed12-blocks',
        'gtf-to-bed12',
        args.star_executable
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'genome_base_path',
        'genome_name',
        'gtf',
        'fasta',
        'ribosomal_fasta',
        'ribosomal_index',
        'star_index'
    ]
    utils.check_keys_exist(config, required_keys)

    # check that the required files are present
    files = [config['gtf'], config['fasta'], config['ribosomal_fasta']]
    if 'de_novo_gtf' in config:
        files += [config['de_novo_gtf']]
    utils.check_files_exist(files, source='prepare-rpbp-genome')

    # now, check if we want to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # the rRNA index
    cmd = "bowtie2-build-s {} {}".format(
        config['ribosomal_fasta'], config['ribosomal_index'])
    in_files = [config['ribosomal_fasta']]
    out_files = bio.get_bowtie2_index_files(config['ribosomal_index'])
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
        overwrite=args.overwrite, call=call)

    # the STAR index
    mem = utils.human2bytes(args.mem)
    cmd = ("{} --runMode genomeGenerate --genomeDir {} --genomeFastaFiles {} "
        "--runThreadN {} --limitGenomeGenerateRAM {}".format(
            args.star_executable, config['star_index'], config['fasta'],
            args.num_cpus, mem))
    in_files = [config['fasta']]
    out_files = star_utils.get_star_index_files(config['star_index'])
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
        overwrite=args.overwrite, call=call)

    # get the main orfs
    get_orfs(config['gtf'], args, config, is_annotated=True, is_de_novo=False)

    # eventually, we will use these names
    annotated_orfs = filenames.get_orfs(
        config['genome_base_path'], config['genome_name'],
        note=config.get('orf_note'), is_annotated=True, is_de_novo=False)
    annotated_exons_file = filenames.get_exons(
        config['genome_base_path'], config['genome_name'],
        note=config.get('orf_note'), is_annotated=True, is_de_novo=False,
        is_orf=True)
    orfs_genomic = filenames.get_orfs(
        config['genome_base_path'], config['genome_name'],
        note=config.get('orf_note'))
    exons_file = filenames.get_exons(
        config['genome_base_path'], config['genome_name'],
        note=config.get('orf_note'), is_orf=True)

    use_gff3_specs = config['gtf'].endswith('gff')
    gtf_file = filenames.get_gtf(
        config['genome_base_path'], config['genome_name'],
        is_gff3=use_gff3_specs, is_star_input=True)

    # now, check if we have a de novo assembly
    if 'de_novo_gtf' in config:
        get_orfs(config['de_novo_gtf'], args, config,
            is_annotated=False, is_de_novo=True)

        # we need to concatenate the ORF and exon files
        de_novo_orfs = filenames.get_orfs(
            config['genome_base_path'], config['genome_name'],
            note=config.get('orf_note'), is_annotated=False, is_de_novo=True)
        de_novo_exons_file = filenames.get_exons(
            config['genome_base_path'], config['genome_name'],
            note=config.get('orf_note'), is_annotated=False, is_de_novo=True,
            is_orf=True)

        orfs_files = [annotated_orfs, de_novo_orfs]
        orfs_files_str = ' '.join(orfs_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            orfs_genomic, orfs_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(orfs_files, sort_bed=True)
            concatenated_bed['orf_num'] = range(len(concatenated_bed))
            fields = bed_utils.bed12_field_names + ['orf_num', 'orf_len', 'orf_type']
            bed_utils.write_bed(concatenated_bed[fields], orfs_genomic)
        else:
            msg = "Skipping concatenation because the --do-not-call flag was given"
            logger.info(msg)

        exons_files = [annotated_exons_file, de_novo_exons_file]
        exons_files_str = ' '.join(exons_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            exons_file, exons_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(exons_files, sort_bed=True)
            fields = bed_utils.bed6_field_names + ['exon_index', 'transcript_start']
            bed_utils.write_bed(concatenated_bed[fields], exons_file)
        else:
            msg = "Skipping concatenation because the --do-not-call flag was given"
            logger.info(msg)

        # we also need to concatenate the annotations to inform STAR;
        # there is no particular reason to merge and sort the files, so
        # we just concatenate them (a pure-Python equivalent of this awk
        # call is sketched after this function)
        if (config['de_novo_gtf'].endswith('gff') == use_gff3_specs):
            cmd = ("awk '!/^#/' {} {} > {}".format(
                config['gtf'], config['de_novo_gtf'], gtf_file))
            in_files = [config['gtf'], config['de_novo_gtf']]
            out_files = [gtf_file]
            shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                overwrite=args.overwrite, call=call)
        else:
            msg = ("Skipping concatenation due to a mismatch in format specifications "
                "(GTF2/GFF3) for the reference and de novo annotations. A symlink to "
                "the reference annotations was created instead.")
            logger.warning(msg)
            if os.path.exists(config['gtf']):
                shell_utils.create_symlink(config['gtf'], gtf_file, call)

    else:
        # finally, make sure our files are named correctly
        if os.path.exists(annotated_orfs):
            shell_utils.create_symlink(annotated_orfs, orfs_genomic, call)
        if os.path.exists(annotated_exons_file):
            shell_utils.create_symlink(annotated_exons_file, exons_file, call)
        if os.path.exists(config['gtf']):
            shell_utils.create_symlink(config['gtf'], gtf_file, call)
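# The awk call above ("awk '!/^#/' a b > out") just drops comment/header lines
# beginning with '#' and concatenates the remaining lines of both annotation
# files. A pure-Python equivalent, with made-up file names:
with open("merged.gtf", "w") as out_f:
    for path in ("reference.gtf", "de_novo.gtf"):
        with open(path) as in_f:
            for line in in_f:
                if not line.startswith("#"):
                    out_f.write(line)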
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script extracts the ORF profiles for each specified read length "
            "and offset independently. One sparse matrix file will be created for "
            "each read length. It then collects the values into a sparse tensor.")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name', help="The name for the dataset, used in the "
        "created files")
    parser.add_argument('out', help="The (mtx.gz) output file containing the "
        "ORF profiles and read lengths")
    parser.add_argument('-c', '--is-condition', help="If this flag is present, "
        "then \"name\" will be taken to be a condition name. The profiles for "
        "all relevant replicates of the condition will be created.",
        action='store_true')

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    cpus_str = "--num-cpus {}".format(args.num_cpus)

    msg = "[create-read-length-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    msg = "Reading config file"
    logger.info(msg)
    config = yaml.load(open(args.config))

    # pull out what we need from the config file
    is_unique = not ('keep_riboseq_multimappers' in config)
    seqname_str = utils.get_config_argument(config, 'seqname_prefix')
    note = config.get('note', None)
    orf_note = config.get('orf_note', None)

    orfs = filenames.get_orfs(
        config['genome_base_path'], config['genome_name'], note=orf_note)
    exons = filenames.get_exons(
        config['genome_base_path'], config['genome_name'], note=orf_note,
        is_orf=True)

    # make sure the necessary files exist
    required_files = [orfs, exons]
    msg = "[create-read-length-orf-profiles]: Some input files were missing: "
    utils.check_files_exist(required_files, msg=msg)

    # check which samples to process
    names = [args.name]
    if args.is_condition:
        riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
        names = [n for n in riboseq_replicates[args.name]]

    job_ids = []
    for name in names:
        msg = "Processing sample: {}".format(name)
        logger.info(msg)

        # now the relevant files
        bam = filenames.get_riboseq_bam(
            config['riboseq_data'], name, is_unique=is_unique, note=note)

        # make sure the necessary files exist
        required_files = [bam]
        msg = "[create-read-length-orf-profiles]: Some input files were missing: "
        utils.check_files_exist(required_files, msg=msg)

        # get the lengths and offsets which meet the required criteria from
        # the config file
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config, name, is_unique=is_unique)

        if len(lengths) == 0:
            msg = ("No periodic read lengths and offsets were found. Try relaxing "
                "min_metagene_profile_count, min_metagene_bf_mean, "
                "max_metagene_bf_var, and/or min_metagene_bf_likelihood. Quitting.")
            logger.critical(msg)
            return

        for length, offset in zip(lengths, offsets):
            lengths_str = "--lengths {}".format(length)
            offsets_str = "--offsets {}".format(offset)

            mtx = filenames.get_riboseq_profiles(
                config['riboseq_data'], name, length=[length], offset=[offset],
                is_unique=is_unique, note=note)

            cmd = "extract-orf-profiles {} {} {} {} {} {} {} {} {}".format(
                bam, orfs, exons, mtx, lengths_str, offsets_str,
                seqname_str, cpus_str, logging_str)
            job_id = slurm.check_sbatch(cmd, args=args)
            job_ids.append(job_id)

    # now, collect them into a single file
    # N.B. the lengths and offsets from the last processed sample are used here
    offsets_str = ' '.join(str(o) for o in offsets)
    lengths_str = ' '.join(str(l) for l in lengths)
    offsets_str = "--offsets {}".format(offsets_str)
    lengths_str = "--lengths {}".format(lengths_str)

    is_condition_str = ""
    if args.is_condition:
        is_condition_str = "--is-condition"

    cmd = "collect-read-length-orf-profiles {} {} {} {} {}".format(
        args.config, args.name, args.out, is_condition_str, logging_str)
    slurm.check_sbatch(cmd, args=args, dependencies=job_ids)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This is a helper script which submits a set of samples to SLURM. It "
            "can also be used to run a set of samples sequentially. Due to limitations on "
            "the config file specification, all of the samples must use the same reference "
            "indices (i.e., genome sequence, set of ORFs, etc.).")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('--tmp', help="The temp directory", default=default_tmp)
    parser.add_argument('--flexbar-format-option', help="The name of the \"format\" "
        "option for flexbar. This changed from \"format\" to \"qtrim-format\" in "
        "version 2.7.", default=default_flexbar_format_option)
    parser.add_argument('--overwrite', help="If this flag is present, existing files "
        "will be overwritten.", action='store_true')
    parser.add_argument('--merge-replicates', help="If this flag is present, then "
        "the ORF profiles from the replicates will be merged before making the final "
        "predictions", action='store_true')
    parser.add_argument('--run-replicates', help="If this flag is given with the "
        "--merge-replicates flag, then both the replicates *and* the individual "
        "samples will be run. This flag has no effect if --merge-replicates is not "
        "given.", action='store_true')
    parser.add_argument('-k', '--keep-intermediate-files', help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.", action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    config = yaml.load(open(args.config))
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'create-base-genome-profile',
        'remove-multimapping-reads',
        'extract-metagene-profiles',
        'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets',
        'extract-orf-profiles',
        'estimate-orf-bayes-factors',
        'select-final-prediction-set',
        'create-orf-profiles',
        'predict-translated-orfs',
        'run-rpbp-pipeline'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'riboseq_samples',
        'ribosomal_index',
        'star_index',
        'genome_base_path',
        'genome_name',
        'fasta',
        'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)

    # handle do_not_call so that we _do_ call the pipeline script,
    # but that it does not run anything
    do_not_call_str = ""
    if not call:
        do_not_call_str = "--do-not-call"
    args.do_not_call = False

    overwrite_str = ""
    if args.overwrite:
        overwrite_str = "--overwrite"

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    # if we merge the replicates, then we only use the rpbp script to create
    # the ORF profiles
    profiles_only_str = ""
    if args.merge_replicates and not args.run_replicates:
        profiles_only_str = "--profiles-only"

    if args.run_replicates and not args.merge_replicates:
        msg = ("The --run-replicates option was given without the --merge-replicates "
            "option. It will be ignored.")
        logger.warning(msg)

    flexbar_format_option_str = ""
    if args.flexbar_format_option is not None:
        flexbar_format_option_str = "--flexbar-format-option {}".format(
            args.flexbar_format_option)

    # collect the job_ids in case we are using slurm and need to merge replicates
    job_ids = []

    sample_names = sorted(config['riboseq_samples'].keys())

    for sample_name in sample_names:
        data = config['riboseq_samples'][sample_name]

        tmp_str = ""
        if args.tmp is not None:
            tmp = os.path.join(args.tmp, "{}_{}_rpbp".format(sample_name, note))
            tmp_str = "--tmp {}".format(tmp)

        cmd = "run-rpbp-pipeline {} {} {} --num-cpus {} {} {} {} {} {} {} {} {} {}".format(
            data, args.config, sample_name, args.num_cpus, tmp_str,
            do_not_call_str, overwrite_str, logging_str, star_str,
            profiles_only_str, flexbar_format_option_str, keep_intermediate_str,
            mem_str)

        job_id = slurm.check_sbatch(cmd, args=args)
        job_ids.append(job_id)

    # now, if we are running the "standard" pipeline, we are finished
    if not args.merge_replicates:
        return

    # otherwise, we need to merge the replicates for each condition
    riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
    merge_replicates_str = "--merge-replicates"

    for condition_name in sorted(riboseq_replicates.keys()):
        # then we predict the ORFs
        cmd = "predict-translated-orfs {} {} --num-cpus {} {} {} {} {}".format(
            args.config, condition_name, args.num_cpus, do_not_call_str,
            overwrite_str, logging_str, merge_replicates_str)
        slurm.check_sbatch(cmd, args=args, dependencies=job_ids)