def create_figures(config_file, config, name, offsets_df, args):
    """ This function creates all of the figures in the preprocessing report
        for the given dataset.
    """
    logging_str = logging_utils.get_logging_options_string(args)
    note = config.get('note', None)
    note_str = filenames.get_note_string(note)

    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    image_type_str = "--image-type {}".format(args.image_type)

    min_read_length = int(offsets_df['length'].min())
    max_read_length = int(offsets_df['length'].max())

    min_read_length_str = "--min-read-length {}".format(min_read_length)
    max_read_length_str = "--max-read-length {}".format(max_read_length)

    msg = "{}: Getting and visualizing read length distribution".format(name)
    logger.info(msg)

    # all aligned reads
    genome_bam = filenames.get_riboseq_bam(config['riboseq_data'], name, note=note)

    # uniquely aligned reads
    unique_filename = filenames.get_riboseq_bam(config['riboseq_data'], name,
                                                is_unique=is_unique, note=note)

    # the read length counts
    read_length_distribution = filenames.get_riboseq_read_length_distribution(
        config['riboseq_data'], name, note=note)

    # the plots
    cmd = "get-read-length-distribution {} {} --out {} {}".format(
        genome_bam, unique_filename, read_length_distribution, logging_str)
    in_files = [genome_bam, unique_filename]
    out_files = [read_length_distribution]
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=True)

    # visualize all read counts
    title = None
    if 'riboseq_sample_name_map' in config:
        title = config['riboseq_sample_name_map'].get(name)
    if title is None:
        title = "{}{}".format(name, note_str)

    title_str = "{}, All aligned reads".format(title)
    title_str = "--title={}".format(shlex.quote(title_str))

    # get the basename for the distribution file
    unique_str = filenames.get_unique_string(False)
    sample_name = "{}{}{}".format(name, note_str, unique_str)

    read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
        config['riboseq_data'], name, is_unique=False, note=note,
        image_type=args.image_type)

    cmd = "plot-read-length-distribution {} {} {} {} {} {}".format(
        read_length_distribution, sample_name, read_length_distribution_image,
        title_str, min_read_length_str, max_read_length_str)
    in_files = [read_length_distribution]
    out_files = [read_length_distribution_image]
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=True)

    # visualize unique read counts

    # we already have the title
    title_str = "{}, Uniquely aligned reads".format(title)
    title_str = "--title={}".format(shlex.quote(title_str))

    unique_read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
        config['riboseq_data'], name, is_unique=is_unique, note=note,
        image_type=args.image_type)

    # get the basename for the distribution file
    unique_str = filenames.get_unique_string(True)
    sample_name = "{}{}{}".format(name, note_str, unique_str)

    cmd = "plot-read-length-distribution {} {} {} {} {} {}".format(
        read_length_distribution, sample_name, unique_read_length_distribution_image,
        title_str, min_read_length_str, max_read_length_str)
    in_files = [read_length_distribution]
    out_files = [unique_read_length_distribution_image]
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=True)

    # visualize the metagene profiles
    msg = "{}: Visualizing metagene profiles and Bayes' factors".format(name)
    logger.info(msg)

    metagene_profiles = filenames.get_metagene_profiles(
        config['riboseq_data'], name, is_unique=is_unique, note=note)
    profile_bayes_factor = filenames.get_metagene_profiles_bayes_factors(
        config['riboseq_data'], name, is_unique=is_unique, note=note)

    mp_df = pd.read_csv(metagene_profiles)

    for length in range(min_read_length, max_read_length + 1):
        mask_length = offsets_df['length'] == length

        # make sure we had some reads of that length
        if sum(mask_length) == 0:
            continue
        length_row = offsets_df[mask_length].iloc[0]

        # make sure we have enough reads to visualize
        if length_row['highest_peak_profile_sum'] < args.min_visualization_count:
            continue

        # visualize the metagene profile
        metagene_profile_image = filenames.get_metagene_profile_image(
            config['riboseq_data'], name, image_type=args.image_type,
            is_unique=is_unique, length=length, note=note)

        title_str = "{}. length: {}".format(title, length)
        title_str = "--title {}".format(shlex.quote(title_str))
        cmd = "create-read-length-metagene-profile-plot {} {} {} {}".format(
            metagene_profiles, length, metagene_profile_image, title_str)
        in_files = [metagene_profiles]
        out_files = [metagene_profile_image]
        shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                       overwrite=args.overwrite, call=True)

        # and the Bayes' factor
        if args.show_read_length_bfs:
            metagene_profile_image = filenames.get_metagene_profile_bayes_factor_image(
                config['riboseq_data'], name, image_type=args.image_type,
                is_unique=is_unique, length=length, note=note)

            title_str = "Metagene profile Bayes' factors: {}. length: {}".format(
                title, length)
            title_str = "--title {}".format(shlex.quote(title_str))
            fontsize_str = "--font-size 15"
            cmd = "visualize-metagene-profile-bayes-factor {} {} {} {} {}".format(
                profile_bayes_factor, length, metagene_profile_image,
                title_str, fontsize_str)
            in_files = [profile_bayes_factor]
            out_files = [metagene_profile_image]
            shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                           overwrite=args.overwrite, call=True)

    # the orf-type metagene profiles
    if args.show_orf_periodicity:
        msg = "{}: Visualizing the ORF type metagene profiles".format(title)
        logger.info(msg)

        try:
            lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                config, name, is_unique=is_unique,
                default_params=metagene_options)
        except FileNotFoundError:
            msg = ("Could not parse out lengths and offsets for sample: {}. "
                   "Skipping".format(name))
            logger.error(msg)
            return

        orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                          config['genome_name'],
                                          note=config.get('orf_note'))

        profiles = filenames.get_riboseq_profiles(config['riboseq_data'], name,
                                                  length=lengths, offset=offsets,
                                                  is_unique=is_unique, note=note)

        title_str = "{}, ORF-type periodicity".format(title)
        title_str = "--title {}".format(shlex.quote(title_str))

        orf_type_profile_base = filenames.get_orf_type_profile_base(
            config['riboseq_data'], name, length=lengths, offset=offsets,
            is_unique=is_unique, note=note, subfolder='orf-profiles')

        strand = "+"
        orf_type_profiles_forward = [
            filenames.get_orf_type_profile_image(orf_type_profile_base,
                                                 orf_type, strand, args.image_type)
            for orf_type in ribo_utils.orf_types
        ]

        strand = "-"
        orf_type_profiles_reverse = [
            filenames.get_orf_type_profile_image(orf_type_profile_base,
                                                 orf_type, strand, args.image_type)
            for orf_type in ribo_utils.orf_types
        ]

        cmd = "visualize-orf-type-metagene-profiles {} {} {} {} {} {}".format(
            orfs_genomic, profiles, orf_type_profile_base, title_str,
            image_type_str, logging_str)
        in_files = [orfs_genomic, profiles]
        out_files = orf_type_profiles_forward + orf_type_profiles_reverse
        shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                       overwrite=args.overwrite)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Collect the individual read length ORF profiles (mtx) created "
                    "by 'create-read-length-orf-profiles' into a single 'sparse tensor'. "
                    "N.B. This script is called by 'create-read-length-orf-profiles'; "
                    "however, we still call each sample independently for condition, "
                    "lengths and offsets.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name', help="The name of either one of the 'riboseq_samples' "
                        "or 'riboseq_biological_replicates' from the config file.")
    parser.add_argument('out', help="The output (txt.gz) file. N.B. The output uses "
                        "base-0 indexing, contrary to the unsmoothed ORF profiles, which "
                        "are written using the matrix market format (base-1 indexing).")
    parser.add_argument('-c', '--is-condition', help="If this flag is present, "
                        "then 'name' will be taken to be a condition name. The profiles "
                        "for all relevant replicates of the condition will be created.",
                        action='store_true')
    parser.add_argument('--add-ids', help="If this flag is present, "
                        "then orf_ids will be added to the final output.",
                        action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading config file"
    logger.info(msg)
    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # pull out what we need from the config file
    is_unique = not ('keep_riboseq_multimappers' in config)
    note = config.get('note', None)

    if args.add_ids:
        orf_note = config.get('orf_note', None)
        orfs_file = filenames.get_orfs(
            config['genome_base_path'],
            config['genome_name'],
            note=orf_note
        )
        orfs = bed_utils.read_bed(orfs_file)

    names = [args.name]
    if args.is_condition:
        riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
        names = [n for n in riboseq_replicates[args.name]]

    # keep a map from the lengths to the combined profiles
    length_profile_map = {}

    for name in names:
        msg = "Processing sample: {}".format(name)
        logger.info(msg)

        # get the lengths and offsets which meet the required criteria from
        # the config file
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config,
            name,
            is_unique=is_unique,
            default_params=metagene_options
        )

        if len(lengths) == 0:
            msg = ("No periodic read lengths and offsets were found. Try relaxing "
                   "min_metagene_profile_count, min_metagene_bf_mean, "
                   "max_metagene_bf_var, and/or min_metagene_bf_likelihood. Quitting.")
            logger.critical(msg)
            return

        for length, offset in zip(lengths, offsets):
            mtx = filenames.get_riboseq_profiles(
                config['riboseq_data'],
                name,
                length=[length],
                offset=[offset],
                is_unique=is_unique,
                note=note
            )
            mtx = scipy.io.mmread(mtx).tocsr()

            prior_mtx = length_profile_map.get(length, None)
            if prior_mtx is None:
                length_profile_map[length] = mtx
            else:
                length_profile_map[length] = prior_mtx + mtx

    if args.add_ids:
        with gzip.open(args.out, 'wb') as target_gz:
            for length, mtx in length_profile_map.items():
                mtx = mtx.tocoo()
                msg = "Writing ORF profiles. length: {}.".format(length)
                logger.info(msg)
                for row, col, val in zip(mtx.row, mtx.col, mtx.data):
                    # row (the orf_num) and col are both zero-based, since we
                    # are now using coo
                    orf_id = orfs.loc[orfs['orf_num'] == row]['id'].values[0]
                    s = "{} {} {} {} {}\n".format(row, orf_id, col, length, val)
                    target_gz.write(s.encode())
    else:
        with gzip.open(args.out, 'wb') as target_gz:
            for length, mtx in length_profile_map.items():
                mtx = mtx.tocoo()
                msg = "Writing ORF profiles. length: {}.".format(length)
                logger.info(msg)
                for row, col, val in zip(mtx.row, mtx.col, mtx.data):
                    s = "{} {} {} {}\n".format(row, col, length, val)
                    target_gz.write(s.encode())
def get_orfs(gtf, args, config, is_annotated=False, is_de_novo=False):
    """ Process a GTF file into its ORFs.
    """
    call = not args.do_not_call
    chr_name_file = os.path.join(config['star_index'], 'chrName.txt')
    chr_name_str = "--chr-name-file {}".format(chr_name_file)

    logging_str = logging_utils.get_logging_options_string(args)
    cpus_str = "--num-cpus {}".format(args.num_cpus)

    # extract a BED12 of the annotated transcripts
    transcript_bed = filenames.get_bed(config['genome_base_path'],
                                       config['genome_name'],
                                       is_merged=False,
                                       is_annotated=is_annotated,
                                       is_de_novo=is_de_novo)

    cmd = "gtf-to-bed12 {} {} {} {} {}".format(gtf, transcript_bed,
                                               chr_name_str, cpus_str,
                                               logging_str)
    in_files = [gtf]
    out_files = [transcript_bed]
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)

    # extract the transcript fasta
    transcript_fasta = filenames.get_transcript_fasta(
        config['genome_base_path'], config['genome_name'],
        is_annotated=is_annotated, is_de_novo=is_de_novo)

    cmd = "extract-bed-sequences {} {} {} {}".format(transcript_bed,
                                                     config['fasta'],
                                                     transcript_fasta,
                                                     logging_str)
    in_files = [transcript_bed, config['fasta']]
    out_files = [transcript_fasta]
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)

    # extract ORFs from the transcripts using genomic coordinates
    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'),
                                      is_annotated=is_annotated,
                                      is_de_novo=is_de_novo)

    start_codons_str = utils.get_config_argument(config, 'start_codons',
                                                 default=default_start_codons)
    stop_codons_str = utils.get_config_argument(config, 'stop_codons',
                                                default=default_stop_codons)

    cmd = "extract-orf-coordinates {} {} {} {} {} {} {}".format(
        transcript_bed, transcript_fasta, orfs_genomic, cpus_str,
        start_codons_str, stop_codons_str, logging_str)
    in_files = [transcript_fasta, transcript_bed]
    out_files = [orfs_genomic]
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)

    # write the ORF exons, used to label the ORFs
    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'),
                                     is_annotated=is_annotated,
                                     is_de_novo=is_de_novo)

    cmd = "split-bed12-blocks {} {} --num-cpus {} {}".format(
        orfs_genomic, exons_file, args.num_cpus, logging_str)
    in_files = [orfs_genomic]
    out_files = [exons_file]
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)

    # label the ORFs
    labeled_orfs = filenames.get_labels(config['genome_base_path'],
                                        config['genome_name'],
                                        note=config.get('orf_note'),
                                        is_annotated=is_annotated,
                                        is_de_novo=is_de_novo)

    annotated_bed = filenames.get_bed(config['genome_base_path'],
                                      config['genome_name'],
                                      is_merged=False,
                                      is_annotated=True)

    orf_exons_str = '--orf-exons {}'.format(exons_file)

    de_novo_str = ""
    if is_de_novo:
        de_novo_str = '--label-prefix "novel_" --filter --nonoverlapping-label "novel"'

    cmd = "label-orfs {} {} {} {} {} {} {}".format(annotated_bed,
                                                   orfs_genomic,
                                                   labeled_orfs,
                                                   orf_exons_str,
                                                   de_novo_str,
                                                   logging_str,
                                                   cpus_str)
    in_files = [annotated_bed, orfs_genomic, exons_file]
    # ** this function overwrites the input file `orfs_genomic`
    out_files = [labeled_orfs]
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)
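
# `utils.get_config_argument` is used throughout these scripts to turn a
# config key into a CLI option string. A rough sketch of the assumed behavior,
# for illustration only; the real helper lives in the supporting utilities
# and may differ in details (e.g., quoting):
def get_config_argument_sketch(config, key, arg_name=None, default=None):
    value = config.get(key, default)
    if value is None:
        return ""
    option = arg_name if arg_name is not None else key.replace('_', '-')
    if isinstance(value, (list, tuple)):
        value = ' '.join(str(v) for v in value)
    return "--{} {}".format(option, value)

# e.g., with default_start_codons = ['ATG'], a config without 'start_codons'
# would yield "--start-codons ATG" under these assumptions.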
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='''Prepare a reference genome and matching annotations,
        including labelled ORFs, for use with the Rp-Bp periodicity estimation
        and ORF translation prediction pipeline.''')

    parser.add_argument('config', help='''The (yaml) configuration file''')

    parser.add_argument('--overwrite', help='''If this flag is present, existing files
        will be overwritten.''', action='store_true')

    slurm.add_sbatch_options(parser, num_cpus=default_num_cpus, mem=default_mem)
    logging_utils.add_logging_options(parser)
    pgrm_utils.add_star_options(parser, star_executable)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # check required callable programs, config keys and files
    programs = [
        'extract-orf-coordinates',
        'label-orfs',
        'bowtie2-build-s',
        'split-bed12-blocks',
        'gtf-to-bed12',
        args.star_executable
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'genome_base_path',
        'genome_name',
        'gtf',
        'fasta',
        'ribosomal_fasta',
        'ribosomal_index',
        'star_index'
    ]
    utils.check_keys_exist(config, required_keys)

    files = [config['gtf'], config['fasta'], config['ribosomal_fasta']]
    if 'de_novo_gtf' in config:
        files += [config['de_novo_gtf']]
    utils.check_files_exist(files, source='prepare-rpbp-genome')

    # check if we want to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    call = not args.do_not_call

    # the rRNA index
    cmd = "bowtie2-build-s {} {}".format(config['ribosomal_fasta'],
                                         config['ribosomal_index'])

    in_files = [config['ribosomal_fasta']]
    out_files = pgrm_utils.get_bowtie2_index_files(config['ribosomal_index'])
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)

    # the STAR index
    mem = utils.human2bytes(args.mem)
    cmd = ("{} --runMode genomeGenerate --genomeDir {} --genomeFastaFiles {} "
           "--runThreadN {} --limitGenomeGenerateRAM {}".format(
               args.star_executable, config['star_index'], config['fasta'],
               args.num_cpus, mem))

    in_files = [config['fasta']]
    out_files = pgrm_utils.get_star_index_files(config['star_index'])
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)

    # get the ORFs
    get_orfs(config['gtf'], args, config, is_annotated=True, is_de_novo=False)

    # we will use these files later in the pipeline
    annotated_orfs = filenames.get_orfs(config['genome_base_path'],
                                        config['genome_name'],
                                        note=config.get('orf_note'),
                                        is_annotated=True,
                                        is_de_novo=False)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    annotated_exons_file = filenames.get_exons(config['genome_base_path'],
                                               config['genome_name'],
                                               note=config.get('orf_note'),
                                               is_annotated=True,
                                               is_de_novo=False)

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    annotated_labeled_orfs = filenames.get_labels(config['genome_base_path'],
                                                  config['genome_name'],
                                                  note=config.get('orf_note'),
                                                  is_annotated=True,
                                                  is_de_novo=False)

    labeled_orfs = filenames.get_labels(config['genome_base_path'],
                                        config['genome_name'],
                                        note=config.get('orf_note'))

    use_gff3_specs = config['gtf'].endswith('gff')
    gtf_file = filenames.get_gtf(config['genome_base_path'],
                                 config['genome_name'],
                                 is_gff3=use_gff3_specs,
                                 is_star_input=True)

    # now, check if we have a de novo assembly
    if 'de_novo_gtf' in config:
        get_orfs(config['de_novo_gtf'], args, config,
                 is_annotated=False, is_de_novo=True)

        # we need to concat the ORF and exon files
        de_novo_orfs = filenames.get_orfs(config['genome_base_path'],
                                          config['genome_name'],
                                          note=config.get('orf_note'),
                                          is_annotated=False,
                                          is_de_novo=True)

        orfs_files = [annotated_orfs, de_novo_orfs]

        orfs_files_str = ' '.join(orfs_files)
        msg = "Concatenating files. Output file: {}; Input files: {}".format(
            orfs_genomic, orfs_files_str)
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(orfs_files, sort_bed=True)
            concatenated_bed['orf_num'] = range(len(concatenated_bed))
            additional_columns = ['orf_num', 'orf_len', 'orf_type']
            fields = bed_utils.bed12_field_names + additional_columns
            bed_utils.write_bed(concatenated_bed[fields], orfs_genomic)
        else:
            msg = "Skipping concatenation due to the --do-not-call flag"
            logger.info(msg)

        de_novo_exons_file = filenames.get_exons(config['genome_base_path'],
                                                 config['genome_name'],
                                                 note=config.get('orf_note'),
                                                 is_annotated=False,
                                                 is_de_novo=True)

        exons_files = [annotated_exons_file, de_novo_exons_file]

        exons_files_str = ' '.join(exons_files)
        msg = "Concatenating files. Output file: {}; Input files: {}".format(
            exons_file, exons_files_str)
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(exons_files, sort_bed=True)
            fields = bed_utils.bed6_field_names + ['exon_index', 'transcript_start']
            bed_utils.write_bed(concatenated_bed[fields], exons_file)
        else:
            msg = "Skipping concatenation due to the --do-not-call flag"
            logger.info(msg)

        de_novo_labeled_orfs = filenames.get_labels(config['genome_base_path'],
                                                    config['genome_name'],
                                                    note=config.get('orf_note'),
                                                    is_annotated=False,
                                                    is_de_novo=True)

        label_files = [annotated_labeled_orfs, de_novo_labeled_orfs]

        label_files_str = ' '.join(label_files)
        msg = "Concatenating files. Output file: {}; Input files: {}".format(
            labeled_orfs, label_files_str)
        logger.info(msg)

        if call:
            # not sorted, as is
            concatenated_bed = bed_utils.concatenate(label_files, sort_bed=False)
            bed_utils.write_bed(concatenated_bed, labeled_orfs)
        else:
            msg = "Skipping concatenation due to the --do-not-call flag"
            logger.info(msg)

        # we also need to concat the annotations to inform STAR
        # there is no particular reason to merge and sort the files, so
        # we just concatenate them...
        if config['de_novo_gtf'].endswith('gff') == use_gff3_specs:
            cmd = "awk '!/^#/' {} {} > {}".format(config['gtf'],
                                                  config['de_novo_gtf'],
                                                  gtf_file)
            in_files = [config['gtf'], config['de_novo_gtf']]
            out_files = [gtf_file]
            shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                           overwrite=args.overwrite, call=call)
        else:
            msg = ("Skipping concatenation due to a mismatch in format "
                   "specifications (GTF2/GFF3) between the reference and de novo "
                   "annotations. A symlink to the reference annotations has "
                   "been created.")
            logger.warning(msg)
            if os.path.exists(config['gtf']):
                shell_utils.create_symlink(config['gtf'], gtf_file, call)

    else:
        # if we do not have a de novo assembly, symlink the files
        if os.path.exists(annotated_orfs):
            shell_utils.create_symlink(annotated_orfs, orfs_genomic, call)

        if os.path.exists(annotated_exons_file):
            shell_utils.create_symlink(annotated_exons_file, exons_file, call)

        if os.path.exists(annotated_labeled_orfs):
            shell_utils.create_symlink(annotated_labeled_orfs, labeled_orfs, call)

        if os.path.exists(config['gtf']):
            shell_utils.create_symlink(config['gtf'], gtf_file, call)
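
# For reference, a minimal configuration covering the keys that
# 'prepare-rpbp-genome' validates above. All paths are placeholders, and
# 'de_novo_gtf' is optional:
example_config = yaml.load("""
genome_base_path: /path/to/genome
genome_name: my-genome
gtf: /path/to/annotations.gtf
fasta: /path/to/genome.fa
ribosomal_fasta: /path/to/rrna.fa
ribosomal_index: /path/to/rrna-index
star_index: /path/to/star-index
""", Loader=yaml.FullLoader)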
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""This script runs all of the processing necessary to produce the
        signals used for ORF translation prediction. In particular, it creates the
        metagene profiles, selects the periodic fragments and generates the ORF
        profiles.""")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    parser.add_argument('config', help="The (yaml) configuration file")
    parser.add_argument('name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
                        type=int, default=default_num_cpus)
    parser.add_argument('--mem', help="The amount of RAM to request", default=default_mem)

    parser.add_argument('--tmp', help="The location for temp files", default=None)

    parser.add_argument('--do-not-call', action='store_true')

    parser.add_argument('--overwrite', help="""If this flag is present, existing files
        will be overwritten.""", action='store_true')

    parser.add_argument('-k', '--keep-intermediate-files', help="""If this flag is given,
        then all intermediate files will be kept; otherwise, they will be deleted.
        This feature is implemented piecemeal. If the --do-not-call flag is given,
        then nothing will be deleted.""", action='store_true')

    logging_utils.add_logging_options(parser)
    pgrm_utils.add_star_options(parser, star_executable)
    pgrm_utils.add_flexbar_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[create-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'create-base-genome-profile',
        'remove-multimapping-reads',
        'extract-metagene-profiles',
        'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets',
        'extract-orf-profiles'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'ribosomal_index',
        'gtf',
        'genome_base_path',
        'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)
    models_base = config.get('models_base', default_models_base)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = pgrm_utils.get_star_options_string(args)
    flexbar_str = pgrm_utils.get_flexbar_options_string(args)

    # handle do_not_call so that we do call the preprocessing script,
    # but that it does not run anything
    call = not args.do_not_call
    do_not_call_argument = ""
    if not call:
        do_not_call_argument = "--do-not-call"

    overwrite_argument = ""
    if args.overwrite:
        overwrite_argument = "--overwrite"

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    # check if we want to keep multimappers
    is_unique = not ('keep_riboseq_multimappers' in config)

    riboseq_raw_data = args.raw_data
    riboseq_bam_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                     args.name,
                                                     is_unique=is_unique,
                                                     note=note)

    cmd = ("create-base-genome-profile {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}".format(
        riboseq_raw_data, args.config, args.name, args.num_cpus,
        do_not_call_argument, overwrite_argument, logging_str, star_str,
        tmp_str, flexbar_str, keep_intermediate_str, mem_str))

    # There could be cases where we start somewhere in the middle of creating
    # the base genome profile. So even if the "raw data" is not available,
    # we still want to call the base pipeline.
    # in_files = [riboseq_raw_data]
    in_files = []
    out_files = [riboseq_bam_filename]
    # we always call this, and pass --do-not-call through
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=True)

    # Extract the metagene profiles
    start_upstream_str = utils.get_config_argument(
        config, 'metagene_start_upstream', 'start-upstream',
        default=metagene_options['metagene_start_upstream'])
    start_downstream_str = utils.get_config_argument(
        config, 'metagene_start_downstream', 'start-downstream',
        default=metagene_options['metagene_start_downstream'])
    end_upstream_str = utils.get_config_argument(
        config, 'metagene_end_upstream', 'end-upstream',
        default=metagene_options['metagene_end_upstream'])
    end_downstream_str = utils.get_config_argument(
        config, 'metagene_end_downstream', 'end-downstream',
        default=metagene_options['metagene_end_downstream'])

    metagene_profiles = filenames.get_metagene_profiles(config['riboseq_data'],
                                                        args.name,
                                                        is_unique=is_unique,
                                                        note=note)

    # use the canonical transcripts for extracting the metagene profiles
    transcript_bed = filenames.get_bed(config['genome_base_path'],
                                       config['genome_name'],
                                       is_merged=False,
                                       is_annotated=True)

    cmd = ("extract-metagene-profiles {} {} {} --num-cpus {} {} {} {} {} {}".format(
        riboseq_bam_filename, transcript_bed, metagene_profiles,
        args.num_cpus, logging_str, start_upstream_str,
        start_downstream_str, end_upstream_str, end_downstream_str))

    in_files = [riboseq_bam_filename, transcript_bed]
    out_files = [metagene_profiles]
    file_checkers = {
        metagene_profiles: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # estimate the periodicity for each offset for all read lengths
    metagene_profile_bayes_factors = filenames.get_metagene_profiles_bayes_factors(
        config['riboseq_data'], args.name, is_unique=is_unique, note=note)

    periodic_models = filenames.get_models(models_base, 'periodic')
    non_periodic_models = filenames.get_models(models_base, 'nonperiodic')

    periodic_models_str = ' '.join(periodic_models)
    non_periodic_models_str = ' '.join(non_periodic_models)

    periodic_models_str = "--periodic-models {}".format(periodic_models_str)
    non_periodic_models_str = "--nonperiodic-models {}".format(non_periodic_models_str)

    periodic_offset_start_str = utils.get_config_argument(
        config, 'periodic_offset_start',
        default=metagene_options['periodic_offset_start'])
    periodic_offset_end_str = utils.get_config_argument(
        config, 'periodic_offset_end',
        default=metagene_options['periodic_offset_end'])
    metagene_profile_length_str = utils.get_config_argument(
        config, 'metagene_profile_length',
        default=metagene_options['metagene_profile_length'])
    seed_str = utils.get_config_argument(
        config, 'seed', default=metagene_options['seed'])
    chains_str = utils.get_config_argument(
        config, 'chains', default=metagene_options['chains'])
    iterations_str = utils.get_config_argument(
        config, 'metagene_iterations', 'iterations',
        default=metagene_options['metagene_iterations'])

    cmd = ("estimate-metagene-profile-bayes-factors {} {} --num-cpus {} {} {} "
           "{} {} {} {} {} {} {}".format(metagene_profiles,
                                         metagene_profile_bayes_factors,
                                         args.num_cpus,
                                         periodic_models_str,
                                         non_periodic_models_str,
                                         periodic_offset_start_str,
                                         periodic_offset_end_str,
                                         metagene_profile_length_str,
                                         seed_str,
                                         chains_str,
                                         iterations_str,
                                         logging_str))

    in_files = [metagene_profiles]
    in_files.extend(periodic_models)
    in_files.extend(non_periodic_models)
    out_files = [metagene_profile_bayes_factors]
    file_checkers = {
        metagene_profile_bayes_factors: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # select the best read lengths for constructing the signal
    periodic_offsets = filenames.get_periodic_offsets(config['riboseq_data'],
                                                      args.name,
                                                      is_unique=is_unique,
                                                      note=note)

    cmd = "select-periodic-offsets {} {}".format(metagene_profile_bayes_factors,
                                                 periodic_offsets)

    in_files = [metagene_profile_bayes_factors]
    out_files = [periodic_offsets]
    file_checkers = {
        periodic_offsets: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # get the lengths and offsets which meet the required criteria from the config file
    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
        config, args.name, args.do_not_call, is_unique=is_unique,
        default_params=metagene_options)

    if len(lengths) == 0:
        msg = ("No periodic read lengths and offsets were found. Try relaxing "
               "min_metagene_profile_count, min_metagene_bf_mean, "
               "max_metagene_bf_var, and/or min_metagene_bf_likelihood. Quitting.")
        logger.critical(msg)
        return

    lengths_str = ' '.join(lengths)
    offsets_str = ' '.join(offsets)

    seqname_prefix_str = utils.get_config_argument(config, 'seqname_prefix')

    # extract the riboseq profiles for each orf
    unique_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                args.name,
                                                is_unique=is_unique,
                                                note=note)

    profiles_filename = filenames.get_riboseq_profiles(config['riboseq_data'],
                                                       args.name,
                                                       length=lengths,
                                                       offset=offsets,
                                                       is_unique=is_unique,
                                                       note=note)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    cmd = ("extract-orf-profiles {} {} {} {} --lengths {} --offsets {} {} {} "
           "--num-cpus {}".format(unique_filename, orfs_genomic, exons_file,
                                  profiles_filename, lengths_str, offsets_str,
                                  logging_str, seqname_prefix_str, args.num_cpus))

    in_files = [orfs_genomic, exons_file, unique_filename]
    out_files = [profiles_filename]

    # todo: implement a file checker for mtx files
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)
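
# The lengths and offsets returned by `get_periodic_lengths_and_offsets` are
# parallel lists of strings (one offset per periodic read length; the
# ' '.join calls above rely on this), so the 'extract-orf-profiles' command
# ends up with options like "--lengths 29 30 31 --offsets 12 12 13". A tiny
# self-check with made-up values:
_lengths, _offsets = ['29', '30', '31'], ['12', '12', '13']
assert "--lengths {} --offsets {}".format(
    ' '.join(_lengths), ' '.join(_offsets)
) == "--lengths 29 30 31 --offsets 12 12 13"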
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Extract the ORF profiles for each specified read length "
                    "and offset independently, creating one sparse matrix file (mtx) "
                    "for each read length. These are then collected into a 'sparse "
                    "tensor'.")

    parser.add_argument('config', help="The yaml config file.")
    parser.add_argument('name', help="The name of either one of the 'riboseq_samples' "
                        "or 'riboseq_biological_replicates' from the config file.")
    parser.add_argument('out', help="The output (txt.gz) file. N.B. The output uses "
                        "base-0 indexing, contrary to the unsmoothed ORF profiles, which "
                        "are written using the matrix market format (base-1 indexing).")
    parser.add_argument('-c', '--is-condition', help="If this flag is present, "
                        "then 'name' will be taken to be a condition name. The profiles "
                        "for all relevant replicates of the condition will be created.",
                        action='store_true')
    parser.add_argument('--add-ids', help="If this flag is present, "
                        "then orf_ids will be added to the final output.",
                        action='store_true')

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    cpus_str = "--num-cpus {}".format(args.num_cpus)

    msg = "[create-read-length-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    msg = "Reading config file"
    logger.info(msg)
    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # pull out what we need from the config file
    is_unique = not ('keep_riboseq_multimappers' in config)
    seqname_str = utils.get_config_argument(config, 'seqname_prefix')
    note = config.get('note', None)
    orf_note = config.get('orf_note', None)

    orfs = filenames.get_orfs(
        config['genome_base_path'],
        config['genome_name'],
        note=orf_note
    )

    exons = filenames.get_exons(
        config['genome_base_path'],
        config['genome_name'],
        note=orf_note,
        is_orf=True
    )

    # make sure the necessary files exist
    required_files = [orfs, exons]
    msg = "[create-read-length-orf-profiles]: Some input files were missing: "
    utils.check_files_exist(required_files, msg=msg)

    # process one sample or all samples from condition
    names = [args.name]
    is_condition_str = ""
    if args.is_condition:
        is_condition_str = "--is-condition"
        riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
        names = [n for n in riboseq_replicates[args.name]]

    job_ids = []
    for name in names:

        msg = "Processing sample: {}".format(name)
        logger.info(msg)

        # now the relevant files
        bam = filenames.get_riboseq_bam(
            config['riboseq_data'],
            name,
            is_unique=is_unique,
            note=note
        )

        # make sure the necessary files exist
        required_files = [bam]
        msg = "[create-read-length-orf-profiles]: Some input files were missing: "
        utils.check_files_exist(required_files, msg=msg)

        # get the lengths and offsets which meet the required criteria from
        # the config file
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config,
            name,
            is_unique=is_unique
        )

        if len(lengths) == 0:
            msg = ("No periodic read lengths and offsets were found. Try relaxing "
                   "min_metagene_profile_count, min_metagene_bf_mean, "
                   "max_metagene_bf_var, and/or min_metagene_bf_likelihood. Quitting.")
            logger.critical(msg)
            return

        for length, offset in zip(lengths, offsets):
            lengths_str = "--lengths {}".format(length)
            offsets_str = "--offsets {}".format(offset)

            mtx = filenames.get_riboseq_profiles(
                config['riboseq_data'],
                name,
                length=[length],
                offset=[offset],
                is_unique=is_unique,
                note=note
            )

            cmd = "extract-orf-profiles {} {} {} {} {} {} {} {} {}".format(
                bam,
                orfs,
                exons,
                mtx,
                lengths_str,
                offsets_str,
                seqname_str,
                cpus_str,
                logging_str
            )
            job_id = slurm.check_sbatch(cmd, args=args)

            job_ids.append(job_id)

    add_ids_str = ""
    if args.add_ids:
        add_ids_str = "--add-ids"

    cmd = "collect-read-length-orf-profiles {} {} {} {} {} {}".format(
        args.config,
        args.name,
        args.out,
        is_condition_str,
        add_ids_str,
        logging_str
    )

    slurm.check_sbatch(cmd, args=args, dependencies=job_ids)