def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Creates base genome profile.")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    parser.add_argument('config', help="The (yaml) configuration file")
    parser.add_argument('name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
                        type=int, default=default_num_cpus)
    parser.add_argument('--mem', help="The amount of RAM to request", default=default_mem)

    parser.add_argument('-t', '--tmp', help="""The location for temporary files. If not
        specified, program-specific temp locations are used.""", default=None)

    parser.add_argument('--do-not-call', action='store_true')

    parser.add_argument('--overwrite', help="""If this flag is present, existing files
        will be overwritten.""", action='store_true')

    parser.add_argument('-k', '--keep-intermediate-files', help="""If this flag is given,
        then all intermediate files will be kept; otherwise, they will be deleted.
        This feature is implemented piecemeal. If the --do-not-call flag is given,
        then nothing will be deleted.""", action='store_true')

    logging_utils.add_logging_options(parser)
    pgrm_utils.add_star_options(parser, star_executable)
    pgrm_utils.add_flexbar_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[create-base-genome-profile]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'remove-multimapping-reads'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'ribosomal_index',
        'gtf',
        'genome_base_path',
        'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)
    call = not args.do_not_call
    keep_delete_files = args.keep_intermediate_files or args.do_not_call

    # Step 0: Running flexbar to remove adapter sequences

    raw_data = args.raw_data
    flexbar_target = filenames.get_without_adapters_base(config['riboseq_data'],
                                                         args.name, note=note)
    without_adapters = filenames.get_without_adapters_fastq(config['riboseq_data'],
                                                            args.name, note=note)

    adapter_seq_str = utils.get_config_argument(config, 'adapter_sequence', 'adapter-seq')
    adapter_file_str = utils.get_config_argument(config, 'adapter_file', 'adapters')

    # get all options, command line options override defaults
    flexbar_option_str = pgrm_utils.get_final_args(flexbar_options, args.flexbar_options)

    cmd = "flexbar -r {} -t {} {} {} {} -n {}".format(raw_data,
                                                      flexbar_target,
                                                      adapter_seq_str,
                                                      adapter_file_str,
                                                      flexbar_option_str,
                                                      args.num_cpus)
    in_files = [raw_data]
    out_files = [without_adapters]
    file_checkers = {
        without_adapters: fastx_utils.check_fastq_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # Step 1: Running bowtie2 to remove rRNA alignments

    out = utils.abspath("dev", "null")  # we do not care about the alignments
    without_rrna = filenames.get_without_rrna_fastq(config['riboseq_data'],
                                                    args.name, note=note)
    with_rrna = filenames.get_with_rrna_fastq(config['riboseq_data'],
                                              args.name, note=note)

    cmd = "bowtie2 -p {} --very-fast -x {} -U {} -S {} --un-gz {} --al-gz {}".format(
        args.num_cpus,
        config['ribosomal_index'],
        without_adapters,
        out,
        without_rrna,
        with_rrna)

    in_files = [without_adapters]
    in_files.extend(pgrm_utils.get_bowtie2_index_files(config['ribosomal_index']))
    out_files = [without_rrna, with_rrna]
    to_delete = [without_adapters]
    file_checkers = {
        without_rrna: fastx_utils.check_fastq_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call,
                                   keep_delete_files=keep_delete_files,
                                   to_delete=to_delete)

    # Step 2: Running STAR to align rRNA-depleted reads to the genome

    star_output_prefix = filenames.get_riboseq_bam_base(config['riboseq_data'],
                                                        args.name, note=note)
    genome_star_bam = "{}{}".format(star_output_prefix, "Aligned.sortedByCoord.out.bam")

    # get all options, command line options override defaults
    mem_bytes = utils.human2bytes(args.mem)
    star_options['limitBAMsortRAM'] = mem_bytes

    if args.tmp is not None:
        star_tmp_name = str(args.name + "_STARtmp")
        star_tmp_dir = pgrm_utils.create_star_tmp(args.tmp, star_tmp_name)
        star_options['outTmpDir'] = star_tmp_dir

    star_option_str = pgrm_utils.get_final_args(star_options, args.star_options)

    # If the annotation uses GFF3 specs, then we need to inform STAR.
    # Whether or not we have a de novo assembly, the format of "config['gtf']"
    # takes precedence.
    sjdb_gtf_tag_str = ""
    use_gff3_specs = config['gtf'].endswith('gff')
    gtf_file = filenames.get_gtf(config['genome_base_path'],
                                 config['genome_name'],
                                 is_gff3=use_gff3_specs,
                                 is_star_input=True)
    if use_gff3_specs:
        sjdb_gtf_tag_str = "--sjdbGTFtagExonParentTranscript Parent"

    cmd = ("{} --runThreadN {} --genomeDir {} --sjdbGTFfile {} {} --readFilesIn {} "
           "{} --outFileNamePrefix {}".format(args.star_executable,
                                              args.num_cpus,
                                              config['star_index'],
                                              gtf_file,
                                              sjdb_gtf_tag_str,
                                              without_rrna,
                                              star_option_str,
                                              star_output_prefix))
    in_files = [without_rrna]
    in_files.extend(pgrm_utils.get_star_index_files(config['star_index']))
    to_delete = [without_rrna]
    out_files = [genome_star_bam]
    file_checkers = {
        genome_star_bam: bam_utils.check_bam_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call,
                                   keep_delete_files=keep_delete_files,
                                   to_delete=to_delete)

    # now, we need to symlink the (genome) STAR output to that expected by the
    # rest of the pipeline
    genome_sorted_bam = filenames.get_riboseq_bam(config['riboseq_data'],
                                                  args.name, note=note)

    if os.path.exists(genome_star_bam):
        shell_utils.create_symlink(genome_star_bam, genome_sorted_bam, call)
    else:
        msg = ("Could not find the STAR genome bam alignment file. Unless "
               "--do-not-call was given, this is a problem.")
        logger.warning(msg)

    # create the bam index
    cmd = "samtools index -b {}".format(genome_sorted_bam)
    shell_utils.check_call(cmd, call=call)

    # check if we want to keep multimappers
    if 'keep_riboseq_multimappers' in config:
        return

    # remove multimapping reads from the genome file
    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    unique_genome_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                       args.name,
                                                       is_unique=True, note=note)

    cmd = "remove-multimapping-reads {} {} {}".format(genome_sorted_bam,
                                                      unique_genome_filename,
                                                      tmp_str)
    in_files = [genome_sorted_bam]
    out_files = [unique_genome_filename]
    to_delete = [genome_star_bam, genome_sorted_bam]
    file_checkers = {
        unique_genome_filename: bam_utils.check_bam_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call,
                                   keep_delete_files=keep_delete_files,
                                   to_delete=to_delete)
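# Example invocation (hypothetical file names; remaining options fall back to
# the defaults, the config file, or the pgrm_utils STAR/flexbar options):
#
#   create-base-genome-profile sample.fastq.gz config.yaml my-sample \
#       --num-cpus 4 --mem 16G
#
# Below is a minimal sketch of the `shell_utils.call_if_not_exists` contract
# assumed throughout this script. This is an illustration only, NOT the actual
# shell_utils implementation: skip the command when all outputs already exist
# (unless overwrite is set), skip when inputs are missing, validate outputs
# with `file_checkers`, and remove `to_delete` files afterwards unless
# `keep_delete_files` is set.
def _call_if_not_exists_sketch(cmd, out_files, in_files=None, file_checkers=None,
                               overwrite=False, call=True,
                               keep_delete_files=False, to_delete=None):
    import os
    import subprocess

    if not call:
        return

    # nothing to do if all outputs are present and we are not overwriting
    if all(os.path.exists(f) for f in out_files) and not overwrite:
        return

    # the surrounding scripts rely on missing inputs skipping the step
    # rather than failing (see the in_files=[] call in create-orf-profiles)
    missing = [f for f in (in_files or []) if not os.path.exists(f)]
    if missing:
        print("skipping '{}'; missing input files: {}".format(cmd.split()[0], missing))
        return

    subprocess.run(cmd, shell=True, check=True)

    # validate the outputs, e.g. with fastx_utils.check_fastq_file
    for out_file, checker in (file_checkers or {}).items():
        if not checker(out_file):
            raise RuntimeError("output file failed validation: {}".format(out_file))

    # clean up intermediate files unless asked to keep them
    if not keep_delete_files:
        for f in (to_delete or []):
            if os.path.exists(f):
                os.remove(f)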
def get_orfs(gtf, args, config, is_annotated=False, is_de_novo=False):
    """ Process a GTF file into its ORFs.
    """
    call = not args.do_not_call
    chr_name_file = os.path.join(config['star_index'], 'chrName.txt')
    chr_name_str = "--chr-name-file {}".format(chr_name_file)

    logging_str = logging_utils.get_logging_options_string(args)
    cpus_str = "--num-cpus {}".format(args.num_cpus)

    # extract a BED12 of the annotated ORFs
    transcript_bed = filenames.get_bed(config['genome_base_path'],
                                       config['genome_name'],
                                       is_merged=False,
                                       is_annotated=is_annotated,
                                       is_de_novo=is_de_novo)

    cmd = ("gtf-to-bed12 {} {} {} {} {}".format(gtf,
                                                transcript_bed,
                                                chr_name_str,
                                                cpus_str,
                                                logging_str))
    in_files = [gtf]
    out_files = [transcript_bed]
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)

    # extract the transcript fasta
    transcript_fasta = filenames.get_transcript_fasta(config['genome_base_path'],
                                                      config['genome_name'],
                                                      is_annotated=is_annotated,
                                                      is_de_novo=is_de_novo)

    cmd = ("extract-bed-sequences {} {} {} {}".format(transcript_bed,
                                                      config['fasta'],
                                                      transcript_fasta,
                                                      logging_str))
    in_files = [transcript_bed, config['fasta']]
    out_files = [transcript_fasta]
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)

    # extract ORFs from the transcripts using genomic coordinates
    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'),
                                      is_annotated=is_annotated,
                                      is_de_novo=is_de_novo)

    start_codons_str = utils.get_config_argument(config,
                                                 'start_codons',
                                                 default=default_start_codons)
    stop_codons_str = utils.get_config_argument(config,
                                                'stop_codons',
                                                default=default_stop_codons)

    cmd = "extract-orf-coordinates {} {} {} {} {} {} {}".format(transcript_bed,
                                                                transcript_fasta,
                                                                orfs_genomic,
                                                                cpus_str,
                                                                start_codons_str,
                                                                stop_codons_str,
                                                                logging_str)
    in_files = [transcript_fasta, transcript_bed]
    out_files = [orfs_genomic]
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)

    # write the ORF exons, used to label the ORFs
    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'),
                                     is_annotated=is_annotated,
                                     is_de_novo=is_de_novo)

    cmd = ("split-bed12-blocks {} {} --num-cpus {} {}".format(orfs_genomic,
                                                              exons_file,
                                                              args.num_cpus,
                                                              logging_str))
    in_files = [orfs_genomic]
    out_files = [exons_file]
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)

    # label the ORFs
    labeled_orfs = filenames.get_labels(config['genome_base_path'],
                                        config['genome_name'],
                                        note=config.get('orf_note'),
                                        is_annotated=is_annotated,
                                        is_de_novo=is_de_novo)

    annotated_bed = filenames.get_bed(config['genome_base_path'],
                                      config['genome_name'],
                                      is_merged=False,
                                      is_annotated=True)

    orf_exons_str = '--orf-exons {}'.format(exons_file)

    de_novo_str = ""
    if is_de_novo:
        de_novo_str = '--label-prefix "novel_" --filter --nonoverlapping-label "novel"'

    cmd = "label-orfs {} {} {} {} {} {} {}".format(annotated_bed,
                                                   orfs_genomic,
                                                   labeled_orfs,
                                                   orf_exons_str,
                                                   de_novo_str,
                                                   logging_str,
                                                   cpus_str)
    in_files = [annotated_bed, orfs_genomic, exons_file]
    # N.B. this function overwrites the input file `orfs_genomic`
    out_files = [labeled_orfs]
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)
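# Example call (hypothetical; `args` and `config` as parsed by the entry
# points in this module, with the annotated GTF taken from the config):
#
#   get_orfs(config['gtf'], args, config, is_annotated=True, is_de_novo=False)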
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script identifies the ORF peptide matches for all samples in "
                    "a project.")

    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--peptide-filter-field', help="The field to use for "
                        "filtering the peptides from MaxQuant",
                        default=default_peptide_filter_field)

    parser.add_argument('--peptide-filter-value', help="All peptides with a value "
                        "greater than the filter value will be removed",
                        type=float, default=default_peptide_filter_value)

    parser.add_argument('--peptide-separator', help="The separator in the "
                        "peptide file", default=default_peptide_separator)

    parser.add_argument('--note', help="If this option is given, it will be used in "
                        "the output filenames.\n\nN.B. This REPLACES the note in the "
                        "config file.", default=default_note)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)
    call = not args.do_not_call

    programs = ['get-orf-peptide-matches']
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'peptide_files',
        'peptide_cell_type_analysis',
        'riboseq_data',
        'riboseq_samples'
    ]
    utils.check_keys_exist(config, required_keys)

    note_str = config.get('note', None)
    out_note_str = note_str
    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note

    args_dict = vars(args)

    # N.B. the keys must match the argparse destinations above
    peptide_filter_field_str = utils.get_config_argument(args_dict,
                                                         'peptide_filter_field')
    peptide_filter_value_str = utils.get_config_argument(args_dict,
                                                         'peptide_filter_value')
    peptide_separator_str = utils.get_config_argument(args_dict, 'peptide_separator')
    num_cpus_str = utils.get_config_argument(args_dict, 'num_cpus')

    cell_types = ribo_utils.get_riboseq_cell_type_samples(config)
    for cell_type, peptide_files in config['peptide_cell_type_analysis'].items():
        if cell_type not in cell_types:
            msg = ("Could not find cell_type specification. Please check the config "
                   "file: {}".format(cell_type))
            logger.warning(msg)
            continue

        cell_type_protein = ribo_filenames.get_riboseq_cell_type_protein(
            config['riboseq_data'], cell_type, is_filtered=True, note=note_str)

        if not os.path.exists(cell_type_protein):
            msg = ("Could not find cell_type protein fasta. Skipping: {}".format(
                cell_type_protein))
            logger.warning(msg)
            continue

        for peptide_file in peptide_files:
            if peptide_file not in config['peptide_files']:
                msg = ("Could not find peptide_file specification. Please check "
                       "the config file: {}".format(peptide_file))
                logger.warning(msg)
                continue

            peptide_txt_file = config['peptide_files'][peptide_file]
            if not os.path.exists(peptide_txt_file):
                msg = ("Could not find peptide.txt file. Skipping: {}".format(
                    peptide_txt_file))
                logger.warning(msg)
                continue

            peptide_matches = ribo_filenames.get_riboseq_peptide_matches(
                config['riboseq_data'], cell_type, peptide_file,
                is_filtered=True, note=out_note_str)

            cmd = "get-orf-peptide-matches {} {} {} {} {} {} {} {}".format(
                cell_type_protein, peptide_txt_file, peptide_matches,
                num_cpus_str, peptide_filter_field_str, peptide_filter_value_str,
                peptide_separator_str, logging_str)

            slurm.check_sbatch(cmd, args=args)
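# Example invocation, assuming this wrapper is installed as a console script
# (the name `get-all-orf-peptide-matches` is hypothetical; it submits one
# `get-orf-peptide-matches` job per cell type and peptide file):
#
#   get-all-orf-peptide-matches config.yaml --peptide-filter-value 0.05 \
#       --note filtered-run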
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""This script runs all of the processing necessary to produce the
        signals used for ORF translation prediction. In particular, it creates the
        metagene profiles, selects the periodic fragments, and generates the ORF
        profiles.""")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    parser.add_argument('config', help="The (yaml) configuration file")
    parser.add_argument('name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
                        type=int, default=default_num_cpus)
    parser.add_argument('--mem', help="The amount of RAM to request", default=default_mem)

    parser.add_argument('--tmp', help="The location for temp files", default=None)

    parser.add_argument('--do-not-call', action='store_true')

    parser.add_argument('--overwrite', help="""If this flag is present, existing files
        will be overwritten.""", action='store_true')

    parser.add_argument('-k', '--keep-intermediate-files', help="""If this flag is given,
        then all intermediate files will be kept; otherwise, they will be deleted.
        This feature is implemented piecemeal. If the --do-not-call flag is given,
        then nothing will be deleted.""", action='store_true')

    logging_utils.add_logging_options(parser)
    pgrm_utils.add_star_options(parser, star_executable)
    pgrm_utils.add_flexbar_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[create-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'create-base-genome-profile',
        'remove-multimapping-reads',
        'extract-metagene-profiles',
        'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets',
        'extract-orf-profiles'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'ribosomal_index',
        'gtf',
        'genome_base_path',
        'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)
    models_base = config.get('models_base', default_models_base)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = pgrm_utils.get_star_options_string(args)
    flexbar_str = pgrm_utils.get_flexbar_options_string(args)

    # handle do_not_call so that we do call the preprocessing script,
    # but that it does not run anything
    call = not args.do_not_call
    do_not_call_argument = ""
    if not call:
        do_not_call_argument = "--do-not-call"

    overwrite_argument = ""
    if args.overwrite:
        overwrite_argument = "--overwrite"

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    # check if we want to keep multimappers
    is_unique = not ('keep_riboseq_multimappers' in config)

    riboseq_raw_data = args.raw_data
    riboseq_bam_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                     args.name,
                                                     is_unique=is_unique,
                                                     note=note)

    cmd = ("create-base-genome-profile {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}".format(
        riboseq_raw_data,
        args.config,
        args.name,
        args.num_cpus,
        do_not_call_argument,
        overwrite_argument,
        logging_str,
        star_str,
        tmp_str,
        flexbar_str,
        keep_intermediate_str,
        mem_str))

    # There could be cases where we start somewhere in the middle of creating
    # the base genome profile. So even if the "raw data" is not available,
    # we still want to call the base pipeline.
    # in_files = [riboseq_raw_data]
    in_files = []
    out_files = [riboseq_bam_filename]
    # we always call this, and pass --do-not-call through
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=True)

    # Extract the metagene profiles

    start_upstream_str = utils.get_config_argument(config,
                                                   'metagene_start_upstream',
                                                   'start-upstream',
                                                   default=metagene_options['metagene_start_upstream'])
    start_downstream_str = utils.get_config_argument(config,
                                                     'metagene_start_downstream',
                                                     'start-downstream',
                                                     default=metagene_options['metagene_start_downstream'])
    end_upstream_str = utils.get_config_argument(config,
                                                 'metagene_end_upstream',
                                                 'end-upstream',
                                                 default=metagene_options['metagene_end_upstream'])
    end_downstream_str = utils.get_config_argument(config,
                                                   'metagene_end_downstream',
                                                   'end-downstream',
                                                   default=metagene_options['metagene_end_downstream'])

    metagene_profiles = filenames.get_metagene_profiles(config['riboseq_data'],
                                                        args.name,
                                                        is_unique=is_unique,
                                                        note=note)

    # use the canonical transcripts for extracting the metagene profiles
    transcript_bed = filenames.get_bed(config['genome_base_path'],
                                       config['genome_name'],
                                       is_merged=False,
                                       is_annotated=True)

    cmd = ("extract-metagene-profiles {} {} {} --num-cpus {} {} {} {} {} {}".format(
        riboseq_bam_filename,
        transcript_bed,
        metagene_profiles,
        args.num_cpus,
        logging_str,
        start_upstream_str,
        start_downstream_str,
        end_upstream_str,
        end_downstream_str))

    in_files = [riboseq_bam_filename, transcript_bed]
    out_files = [metagene_profiles]
    file_checkers = {
        metagene_profiles: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # estimate the periodicity for each offset for all read lengths
    metagene_profile_bayes_factors = filenames.get_metagene_profiles_bayes_factors(
        config['riboseq_data'],
        args.name,
        is_unique=is_unique,
        note=note)

    periodic_models = filenames.get_models(models_base, 'periodic')
    non_periodic_models = filenames.get_models(models_base, 'nonperiodic')

    periodic_models_str = ' '.join(periodic_models)
    non_periodic_models_str = ' '.join(non_periodic_models)

    periodic_models_str = "--periodic-models {}".format(periodic_models_str)
    non_periodic_models_str = "--nonperiodic-models {}".format(non_periodic_models_str)

    periodic_offset_start_str = utils.get_config_argument(config,
                                                          'periodic_offset_start',
                                                          default=metagene_options['periodic_offset_start'])
    periodic_offset_end_str = utils.get_config_argument(config,
                                                        'periodic_offset_end',
                                                        default=metagene_options['periodic_offset_end'])
    metagene_profile_length_str = utils.get_config_argument(config,
                                                            'metagene_profile_length',
                                                            default=metagene_options['metagene_profile_length'])
    seed_str = utils.get_config_argument(config,
                                         'seed',
                                         default=metagene_options['seed'])
    chains_str = utils.get_config_argument(config,
                                           'chains',
                                           default=metagene_options['chains'])
    iterations_str = utils.get_config_argument(config,
                                               'metagene_iterations',
                                               'iterations',
                                               default=metagene_options['metagene_iterations'])

    cmd = ("estimate-metagene-profile-bayes-factors {} {} --num-cpus {} {} {} "
           "{} {} {} {} {} {} {}".format(metagene_profiles,
                                         metagene_profile_bayes_factors,
                                         args.num_cpus,
                                         periodic_models_str,
                                         non_periodic_models_str,
                                         periodic_offset_start_str,
                                         periodic_offset_end_str,
                                         metagene_profile_length_str,
                                         seed_str,
                                         chains_str,
                                         iterations_str,
                                         logging_str))

    in_files = [metagene_profiles]
    in_files.extend(periodic_models)
    in_files.extend(non_periodic_models)
    out_files = [metagene_profile_bayes_factors]
    file_checkers = {
        metagene_profile_bayes_factors: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # select the best read lengths for constructing the signal
    periodic_offsets = filenames.get_periodic_offsets(config['riboseq_data'],
                                                      args.name,
                                                      is_unique=is_unique,
                                                      note=note)

    cmd = "select-periodic-offsets {} {}".format(metagene_profile_bayes_factors,
                                                 periodic_offsets)
    in_files = [metagene_profile_bayes_factors]
    out_files = [periodic_offsets]
    file_checkers = {
        periodic_offsets: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # get the lengths and offsets which meet the required criteria from the config file
    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(config,
                                                                   args.name,
                                                                   args.do_not_call,
                                                                   is_unique=is_unique,
                                                                   default_params=metagene_options)

    if len(lengths) == 0:
        msg = ("No periodic read lengths and offsets were found. Try relaxing "
               "min_metagene_profile_count, min_metagene_bf_mean, max_metagene_bf_var, "
               "and/or min_metagene_bf_likelihood. Quitting.")
        logger.critical(msg)
        return

    lengths_str = ' '.join(lengths)
    offsets_str = ' '.join(offsets)

    seqname_prefix_str = utils.get_config_argument(config, 'seqname_prefix')

    # extract the riboseq profiles for each ORF
    unique_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                args.name,
                                                is_unique=is_unique,
                                                note=note)

    profiles_filename = filenames.get_riboseq_profiles(config['riboseq_data'],
                                                       args.name,
                                                       length=lengths,
                                                       offset=offsets,
                                                       is_unique=is_unique,
                                                       note=note)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    cmd = ("extract-orf-profiles {} {} {} {} --lengths {} --offsets {} {} {} --num-cpus {}".format(
        unique_filename,
        orfs_genomic,
        exons_file,
        profiles_filename,
        lengths_str,
        offsets_str,
        logging_str,
        seqname_prefix_str,
        args.num_cpus))

    in_files = [orfs_genomic, exons_file, unique_filename]
    out_files = [profiles_filename]

    # todo: implement a file checker for mtx files
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)
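# Example invocation (hypothetical file names; `create-orf-profiles` is the
# console-script name logged above, and it drives the full chain from raw
# fastq through periodic offsets to the ORF profiles):
#
#   create-orf-profiles sample.fastq.gz config.yaml my-sample \
#       --num-cpus 4 --mem 16G --keep-intermediate-files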
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Extract the ORF profiles for each specified read length "
                    "and offset independently, creating one sparse matrix file (mtx) "
                    "for each read length. These are then collected into a 'sparse "
                    "tensor'.")

    parser.add_argument('config', help="The yaml config file.")

    parser.add_argument('name', help="The name of one of the 'riboseq_samples' "
                        "or 'riboseq_biological_replicates' from the config file.")

    parser.add_argument('out', help="The output (txt.gz) file. N.B. The output uses "
                        "base-0 indexing, contrary to the unsmoothed ORF profiles, "
                        "which are written using the matrix market format "
                        "(base-1 indexing).")

    parser.add_argument('-c', '--is-condition', help="If this flag is present, "
                        "then 'name' will be taken to be a condition name. The profiles "
                        "for all relevant replicates of the condition will be created.",
                        action='store_true')

    parser.add_argument('--add-ids', help="If this flag is present, "
                        "then orf_ids will be added to the final output.",
                        action='store_true')

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    cpus_str = "--num-cpus {}".format(args.num_cpus)

    msg = "[create-read-length-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    msg = "Reading config file"
    logger.info(msg)
    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # pull out what we need from the config file
    is_unique = not ('keep_riboseq_multimappers' in config)
    seqname_str = utils.get_config_argument(config, 'seqname_prefix')
    note = config.get('note', None)
    orf_note = config.get('orf_note', None)

    orfs = filenames.get_orfs(
        config['genome_base_path'],
        config['genome_name'],
        note=orf_note
    )

    exons = filenames.get_exons(
        config['genome_base_path'],
        config['genome_name'],
        note=orf_note,
        is_orf=True
    )

    # make sure the necessary files exist
    required_files = [orfs, exons]
    msg = "[create-read-length-orf-profiles]: Some input files were missing: "
    utils.check_files_exist(required_files, msg=msg)

    # process one sample or all samples from the condition
    names = [args.name]
    is_condition_str = ""
    if args.is_condition:
        is_condition_str = "--is-condition"
        riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
        names = [n for n in riboseq_replicates[args.name]]

    job_ids = []
    for name in names:

        msg = "Processing sample: {}".format(name)
        logger.info(msg)

        # now the relevant files
        bam = filenames.get_riboseq_bam(
            config['riboseq_data'],
            name,
            is_unique=is_unique,
            note=note
        )

        # make sure the necessary files exist
        required_files = [bam]
        msg = "[create-read-length-orf-profiles]: Some input files were missing: "
        utils.check_files_exist(required_files, msg=msg)

        # get the lengths and offsets which meet the required criteria from the config file
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config,
            name,
            is_unique=is_unique
        )

        if len(lengths) == 0:
            msg = ("No periodic read lengths and offsets were found. Try relaxing "
                   "min_metagene_profile_count, min_metagene_bf_mean, "
                   "max_metagene_bf_var, and/or min_metagene_bf_likelihood. Quitting.")
            logger.critical(msg)
            return

        for length, offset in zip(lengths, offsets):
            lengths_str = "--lengths {}".format(length)
            offsets_str = "--offsets {}".format(offset)

            mtx = filenames.get_riboseq_profiles(
                config['riboseq_data'],
                name,
                length=[length],
                offset=[offset],
                is_unique=is_unique,
                note=note
            )

            cmd = "extract-orf-profiles {} {} {} {} {} {} {} {} {}".format(
                bam,
                orfs,
                exons,
                mtx,
                lengths_str,
                offsets_str,
                seqname_str,
                cpus_str,
                logging_str
            )
            job_id = slurm.check_sbatch(cmd, args=args)

            job_ids.append(job_id)

    add_ids_str = ""
    if args.add_ids:
        add_ids_str = "--add-ids"

    cmd = "collect-read-length-orf-profiles {} {} {} {} {} {}".format(
        args.config,
        args.name,
        args.out,
        is_condition_str,
        add_ids_str,
        logging_str
    )

    slurm.check_sbatch(cmd, args=args, dependencies=job_ids)