Example #1
0
def main():
    """Command-line entry point for ``create-orf-profiles``.

    Parses the command line, loads the YAML config file, and then builds and
    dispatches (via ``shell_utils.call_if_not_exists``) the following steps:

    1. ``create-base-genome-profile`` (always invoked; ``--do-not-call`` is
       forwarded so the child script decides what to actually run)
    2. ``extract-metagene-profiles``
    3. ``estimate-metagene-profile-bayes-factors``
    4. ``select-periodic-offsets``
    5. ``extract-orf-profiles``

    If no periodic read lengths are found after step 4, a critical message is
    logged and the function returns without running step 5.

    Returns:
        None
    """

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script runs all of the processing necessary to produce the "
        "signals used for later processing. In particular, it runs the standard "
        "rnaseq and riboseq preprocessing, estimates the abundance of transcripts with "
        "the rnaseq data, and selects the most-expressed isoforms and ORFs. Next, it removes "
        "multimapping and non-periodic-length riboseq reads. Finally, it extracts the riboseq "
        "signal for the most-expressed transcripts.")
    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    # the config file is parsed with yaml below, so advertise it as such
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument(
        'name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of processors to use",
                        type=int,
                        default=default_num_cpus)

    parser.add_argument('--mem',
                        help="The amount of RAM to request",
                        default=default_mem)

    parser.add_argument(
        '--flexbar-format-option',
        help="The name of the \"format\" "
        "option for flexbar. This changed from \"format\" to \"qtrim-format\" in "
        "version 2.7.",
        default=default_flexbar_format_option)

    parser.add_argument('--tmp',
                        help="The location for temp files",
                        default=default_tmp)

    parser.add_argument('--do-not-call', action='store_true')
    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files "
                        "will be overwritten.",
                        action='store_true')

    parser.add_argument(
        '-k',
        '--keep-intermediate-files',
        help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.",
        action='store_true')

    star_utils.add_star_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[create-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    # option strings which are forwarded verbatim to the child scripts
    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    # safe_load only builds plain python objects and does not need an explicit
    # Loader; the context manager makes sure the file handle is closed
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar', args.star_executable, 'samtools', 'bowtie2',
        'create-base-genome-profile', 'remove-multimapping-reads',
        'extract-metagene-profiles', 'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets', 'extract-orf-profiles'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data', 'ribosomal_index', 'gtf', 'genome_base_path',
        'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)

    # NOTE(review): star_index is not used anywhere below in this function
    star_index = filenames.get_star_index(config['genome_base_path'],
                                          config['genome_name'],
                                          is_merged=False)

    models_base = config.get('models_base', default_models_base)

    # the first step is the standard riboseq preprocessing

    # handle do_not_call so that we _do_ call the preprocessing script,
    # but that it does not run anything
    do_not_call_argument = ""
    if not call:
        do_not_call_argument = "--do-not-call"

    overwrite_argument = ""
    if args.overwrite:
        overwrite_argument = "--overwrite"

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    flexbar_format_option_str = ""
    if args.flexbar_format_option is not None:
        flexbar_format_option_str = "--flexbar-format-option {}".format(
            args.flexbar_format_option)

    # check if we want to keep multimappers; only the presence of the key is
    # tested, its value is not inspected
    is_unique = not ('keep_riboseq_multimappers' in config)

    riboseq_raw_data = args.raw_data
    riboseq_bam_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                     args.name,
                                                     is_unique=is_unique,
                                                     note=note)

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    cmd = (
        "create-base-genome-profile {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}"
        .format(riboseq_raw_data, args.config, args.name, args.num_cpus,
                do_not_call_argument, overwrite_argument, logging_str,
                star_str, tmp_str, flexbar_format_option_str,
                keep_intermediate_str, mem_str))

    # There could be cases where we start somewhere in the middle of creating
    # the base genome profile. So even if the "raw data" is not available,
    # we still want to call the base pipeline.
    #in_files = [riboseq_raw_data]
    in_files = []
    out_files = [riboseq_bam_filename]
    # we always call this, and pass --do-not-call through
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=True)

    # create the metagene profiles
    metagene_profiles = filenames.get_metagene_profiles(config['riboseq_data'],
                                                        args.name,
                                                        is_unique=is_unique,
                                                        note=note)

    # optional command line options taken from the config file
    seqids_to_keep_str = utils.get_config_argument(config, 'seqids_to_keep')
    start_upstream_str = utils.get_config_argument(
        config, 'metagene_profile_start_upstream', 'start-upstream')
    start_downstream_str = utils.get_config_argument(
        config, 'metagene_profile_start_downstream', 'start-downstream')
    end_upstream_str = utils.get_config_argument(
        config, 'metagene_profile_end_upstream', 'end-upstream')
    end_downstream_str = utils.get_config_argument(
        config, 'metagene_profile_end_downstream', 'end-downstream')

    # use the canonical transcripts for extracting the metagene profiles
    transcript_bed = filenames.get_bed(config['genome_base_path'],
                                       config['genome_name'],
                                       is_merged=False,
                                       is_annotated=True)

    cmd = ("extract-metagene-profiles {} {} {} --num-cpus {} {} {} {} {} {} {}"
           .format(riboseq_bam_filename, transcript_bed, metagene_profiles,
                   args.num_cpus, logging_str, seqids_to_keep_str,
                   start_upstream_str, start_downstream_str, end_upstream_str,
                   end_downstream_str))

    # NOTE(review): the command reads transcript_bed, but the ORFs file is
    # what is listed as an input dependency here -- confirm this is intended
    in_files = [riboseq_bam_filename, orfs_genomic]
    out_files = [metagene_profiles]
    file_checkers = {metagene_profiles: utils.check_gzip_file}
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite,
                                   call=call)

    # estimate the periodicity for each offset for all read lengths
    metagene_profile_bayes_factors = filenames.get_metagene_profiles_bayes_factors(
        config['riboseq_data'], args.name, is_unique=is_unique, note=note)

    #periodic_models_str = utils.get_config_argument(config, 'periodic_models')
    #non_periodic_models_str = utils.get_config_argument(config, 'nonperiodic_models')
    periodic_models = filenames.get_models(models_base, 'periodic')
    non_periodic_models = filenames.get_models(models_base, 'nonperiodic')

    periodic_models_str = ' '.join(periodic_models)
    non_periodic_models_str = ' '.join(non_periodic_models)

    periodic_models_str = "--periodic-models {}".format(periodic_models_str)
    non_periodic_models_str = "--nonperiodic-models {}".format(
        non_periodic_models_str)

    periodic_offset_start_str = utils.get_config_argument(
        config, 'periodic_offset_start')
    periodic_offset_end_str = utils.get_config_argument(
        config, 'periodic_offset_end')
    metagene_profile_length_str = utils.get_config_argument(
        config, 'metagene_profile_length')
    seed_str = utils.get_config_argument(config, 'seed')
    chains_str = utils.get_config_argument(config, 'chains')
    iterations_str = utils.get_config_argument(config,
                                               'metagene_profile_iterations',
                                               'iterations')

    cmd = ("estimate-metagene-profile-bayes-factors {} {} --num-cpus {} {} {} "
           "{} {} {} {} {} {} {}".format(
               metagene_profiles, metagene_profile_bayes_factors,
               args.num_cpus, periodic_models_str, non_periodic_models_str,
               periodic_offset_start_str, periodic_offset_end_str,
               metagene_profile_length_str, seed_str, chains_str,
               iterations_str, logging_str))

    # the model files are inputs as well as the metagene profiles
    in_files = [metagene_profiles]
    in_files.extend(periodic_models)
    in_files.extend(non_periodic_models)
    out_files = [metagene_profile_bayes_factors]
    file_checkers = {metagene_profile_bayes_factors: utils.check_gzip_file}
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite,
                                   call=call)

    # select the best read lengths for constructing the signal
    periodic_offsets = filenames.get_periodic_offsets(config['riboseq_data'],
                                                      args.name,
                                                      is_unique=is_unique,
                                                      note=note)

    cmd = "select-periodic-offsets {} {}".format(
        metagene_profile_bayes_factors, periodic_offsets)
    in_files = [metagene_profile_bayes_factors]
    out_files = [periodic_offsets]
    file_checkers = {periodic_offsets: utils.check_gzip_file}
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite,
                                   call=call)

    # get the lengths and offsets which meet the required criteria from the config file
    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
        config, args.name, args.do_not_call, is_unique=is_unique)

    if len(lengths) == 0:
        msg = (
            "No periodic read lengths and offsets were found. Try relaxing "
            "min_metagene_profile_count, min_metagene_bf_mean, max_metagene_bf_var, "
            "and/or min_metagene_bf_likelihood. Quitting.")
        logger.critical(msg)
        return

    # both are joined with spaces, so they must be sequences of strings
    lengths_str = ' '.join(lengths)
    offsets_str = ' '.join(offsets)

    seqname_prefix_str = utils.get_config_argument(config, 'seqname_prefix')

    # extract the riboseq profiles for each orf
    unique_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                args.name,
                                                is_unique=is_unique,
                                                note=note)

    profiles_filename = filenames.get_riboseq_profiles(config['riboseq_data'],
                                                       args.name,
                                                       length=lengths,
                                                       offset=offsets,
                                                       is_unique=is_unique,
                                                       note=note)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    cmd = (
        "extract-orf-profiles {} {} {} {} --lengths {} --offsets {} {} {} --num-cpus {} "
        .format(unique_filename, orfs_genomic, exons_file, profiles_filename,
                lengths_str, offsets_str, logging_str, seqname_prefix_str,
                args.num_cpus))
    in_files = [orfs_genomic, exons_file, unique_filename]
    out_files = [profiles_filename]

    #todo: implement a file checker for mtx files
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)
def main():
    """Command-line entry point for ``create-base-genome-profile``.

    Parses the command line, loads the YAML config file, and then builds and
    dispatches the riboseq preprocessing steps:

    0. ``flexbar`` to remove adapter sequences
    1. ``bowtie2`` against the ribosomal index to remove rRNA reads
    2. STAR to align the remaining reads to the genome, followed by a
       symlink to the expected bam name and ``samtools index``
    3. ``remove-multimapping-reads``, unless the config contains the
       ``keep_riboseq_multimappers`` key

    Returns:
        None
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="")  #filenames.run_riboseq_preprocessing_description)

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument(
        'name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of processors to use",
                        type=int,
                        default=default_num_cpus)

    parser.add_argument('--mem',
                        help="The amount of RAM to request",
                        default=default_mem)

    parser.add_argument(
        '--flexbar-format-option',
        help="The name of the \"format\" "
        "option for flexbar. This changed from \"format\" to \"qtrim-format\" in "
        "version 2.7.",
        default=default_flexbar_format_option)

    parser.add_argument('-t',
                        '--tmp',
                        help="The location for temporary files. If not "
                        "specified, program-specific temp locations are used.",
                        default=default_tmp)

    parser.add_argument('--do-not-call', action='store_true')
    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files "
                        "will be overwritten.",
                        action='store_true')

    parser.add_argument(
        '-k',
        '--keep-intermediate-files',
        help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.",
        action='store_true')

    star_utils.add_star_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[create-base-genome-profile]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    # safe_load only builds plain python objects and does not need an explicit
    # Loader; the context manager makes sure the file handle is closed
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)
    call = not args.do_not_call
    # intermediate files are kept if requested, or if nothing is executed
    keep_delete_files = args.keep_intermediate_files or args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar', args.star_executable, 'samtools', 'bowtie2',
        'remove-multimapping-reads'
    ]
    shell_utils.check_programs_exist(programs)

    # 'star_index' is read unconditionally for the STAR step below, so check
    # for it up front instead of failing after flexbar and bowtie2 have run
    required_keys = [
        'riboseq_data', 'ribosomal_index', 'gtf', 'genome_base_path',
        'genome_name', 'star_index'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)

    # Step 0: Running flexbar to remove adapter sequences

    raw_data = args.raw_data
    flexbar_target = filenames.get_without_adapters_base(
        config['riboseq_data'], args.name, note=note)
    without_adapters = filenames.get_without_adapters_fastq(
        config['riboseq_data'], args.name, note=note)

    adapter_seq_str = utils.get_config_argument(config, 'adapter_sequence',
                                                'adapter-seq')
    adapter_file_str = utils.get_config_argument(config, 'adapter_file',
                                                 'adapters')

    # the flexbar option name for the quality format comes from the command
    # line, because it differs between flexbar versions
    quality_format_str = utils.get_config_argument(
        config,
        'quality_format',
        args.flexbar_format_option,
        default=default_quality_format)
    max_uncalled_str = utils.get_config_argument(config,
                                                 'max_uncalled',
                                                 default=default_max_uncalled)
    pre_trim_left_str = utils.get_config_argument(
        config, 'pre_trim_left', default=default_pre_trim_left)

    # NOTE(review): flexbar_compression_str is not defined in this function;
    # presumably it is a module-level constant -- confirm
    cmd = "flexbar {} {} {} {} -n {} {} -r {} -t {} {}".format(
        quality_format_str, max_uncalled_str, adapter_seq_str,
        adapter_file_str, args.num_cpus, flexbar_compression_str, raw_data,
        flexbar_target, pre_trim_left_str)
    in_files = [raw_data]
    out_files = [without_adapters]
    file_checkers = {without_adapters: fastx_utils.check_fastq_file}
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite,
                                   call=call)

    # Step 1: Running bowtie2 to remove rRNA alignments
    out = utils.abspath("dev", "null")  # we do not care about the alignments
    without_rrna = filenames.get_without_rrna_fastq(config['riboseq_data'],
                                                    args.name,
                                                    note=note)
    with_rrna = filenames.get_with_rrna_fastq(config['riboseq_data'],
                                              args.name,
                                              note=note)

    cmd = "bowtie2 -p {} --very-fast -x {} -U {} -S {} --un-gz {} --al-gz {}".format(
        args.num_cpus, config['ribosomal_index'], without_adapters, out,
        without_rrna, with_rrna)
    in_files = [without_adapters]
    in_files.extend(bio.get_bowtie2_index_files(config['ribosomal_index']))
    out_files = [without_rrna, with_rrna]
    to_delete = [without_adapters]
    file_checkers = {without_rrna: fastx_utils.check_fastq_file}
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite,
                                   call=call,
                                   keep_delete_files=keep_delete_files,
                                   to_delete=to_delete)

    # Step 2: Running STAR to align rRNA-depleted reads to genome
    star_output_prefix = filenames.get_riboseq_bam_base(config['riboseq_data'],
                                                        args.name,
                                                        note=note)
    #transcriptome_bam = "{}{}".format(star_output_prefix, "Aligned.toTranscriptome.out.bam")
    genome_star_bam = "{}{}".format(star_output_prefix,
                                    "Aligned.sortedByCoord.out.bam")

    star_compression_str = "--readFilesCommand {}".format(
        shlex.quote(args.star_read_files_command))

    # STAR options which can be overridden in the config file
    align_intron_min_str = utils.get_config_argument(
        config,
        'align_intron_min',
        'alignIntronMin',
        default=default_align_intron_min)
    align_intron_max_str = utils.get_config_argument(
        config,
        'align_intron_max',
        'alignIntronMax',
        default=default_align_intron_max)
    out_filter_mismatch_n_max_str = utils.get_config_argument(
        config,
        'out_filter_mismatch_n_max',
        'outFilterMismatchNmax',
        default=default_out_filter_mismatch_n_max)
    out_filter_mismatch_n_over_l_max_str = utils.get_config_argument(
        config,
        'out_filter_mismatch_n_over_l_max',
        'outFilterMismatchNoverLmax',
        default=default_out_filter_mismatch_n_over_l_max)
    out_filter_type_str = utils.get_config_argument(
        config,
        'out_filter_type',
        'outFilterType',
        default=default_out_filter_type)
    out_filter_intron_motifs_str = utils.get_config_argument(
        config,
        'out_filter_intron_motifs',
        'outFilterIntronMotifs',
        default=default_out_filter_intron_motifs)
    out_sam_attributes_str = utils.get_config_argument(
        config,
        'out_sam_attributes',
        'outSAMattributes',
        default=default_out_sam_attributes)

    star_tmp_str = ""
    if args.tmp is not None:
        star_tmp_name = "STAR_rpbp"
        star_tmp_dir = star_utils.create_star_tmp(args.tmp, star_tmp_name)
        star_tmp_str = "--outTmpDir {}".format(star_tmp_dir)

    mem_bytes = utils.human2bytes(args.mem)
    star_mem_str = "--limitBAMsortRAM {}".format(mem_bytes)

    # NOTE(review): quant_mode_str and star_out_str are not defined in this
    # function; presumably they are module-level constants -- confirm
    cmd = (
        "{} --runThreadN {} {} --genomeDir {} --sjdbGTFfile {} --readFilesIn {} "
        "{} {} {} {} {} {} {} {} --outFileNamePrefix {} {} {} {}".format(
            args.star_executable, args.num_cpus, star_compression_str,
            config['star_index'], config['gtf'], without_rrna,
            align_intron_min_str, align_intron_max_str,
            out_filter_mismatch_n_max_str, out_filter_type_str,
            out_filter_intron_motifs_str, quant_mode_str,
            out_filter_mismatch_n_over_l_max_str, out_sam_attributes_str,
            star_output_prefix, star_out_str, star_tmp_str, star_mem_str))
    in_files = [without_rrna]
    in_files.extend(star_utils.get_star_index_files(config['star_index']))
    #out_files = [transcriptome_bam, genome_star_bam]
    to_delete = [without_rrna]
    out_files = [genome_star_bam]
    file_checkers = {
        #transcriptome_bam: bam_utils.check_bam_file,
        genome_star_bam: bam_utils.check_bam_file
    }
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite,
                                   call=call,
                                   keep_delete_files=keep_delete_files,
                                   to_delete=to_delete)

    # now, we need to symlink the (genome) STAR output to that expected by the rest of the pipeline
    genome_sorted_bam = filenames.get_riboseq_bam(config['riboseq_data'],
                                                  args.name,
                                                  note=note)

    if os.path.exists(genome_star_bam):
        utils.create_symlink(genome_star_bam, genome_sorted_bam, call)
    else:
        msg = ("Could not find the STAR genome bam alignment file. Unless "
               "--do-not-call was given, this is a problem.")
        logger.warning(msg)

    # create the bamtools index
    cmd = "samtools index -b {}".format(genome_sorted_bam)
    shell_utils.check_call(cmd, call=call)

    # check if we want to keep multimappers; only the presence of the key is
    # tested, its value is not inspected
    if 'keep_riboseq_multimappers' in config:
        return

    # remove multimapping reads from the genome file
    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    unique_genome_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                       args.name,
                                                       is_unique=True,
                                                       note=note)

    cmd = "remove-multimapping-reads {} {} {}".format(genome_sorted_bam,
                                                      unique_genome_filename,
                                                      tmp_str)

    in_files = [genome_sorted_bam]
    out_files = [unique_genome_filename]
    to_delete = [genome_star_bam, genome_sorted_bam]
    file_checkers = {unique_genome_filename: bam_utils.check_bam_file}
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite,
                                   call=call,
                                   keep_delete_files=keep_delete_files,
                                   to_delete=to_delete)
Example #3
0
def main():
    """Command-line entry point for preparing the rpbp genome files.

    Parses the command line, loads the YAML config file, verifies that the
    required programs, config keys and input files exist, and then:

    * optionally resubmits itself through slurm and returns
    * builds the ribosomal bowtie2 index and the STAR genome index
    * extracts the annotated ORFs (``get_orfs``)
    * if ``de_novo_gtf`` is in the config, extracts the de novo ORFs and
      concatenates the annotated and de novo ORF and exon bed files;
      otherwise symlinks the annotated files to the generic file names

    Returns:
        None
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script creates all of the files necessary for downstream "
        "analysis performed with the rpbp package.")
    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files "
                        "will be overwritten.",
                        action='store_true')

    # NOTE(review): args.do_not_call, args.use_slurm, args.mem and
    # args.num_cpus are read below but not added here; presumably they come
    # from slurm.add_sbatch_options -- confirm
    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)

    # safe_load only builds plain python objects and does not need an explicit
    # Loader; the context manager makes sure the file handle is closed
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'extract-orf-coordinates', 'label-orfs', 'bowtie2-build-s',
        'split-bed12-blocks', 'gtf-to-bed12', args.star_executable
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'genome_base_path', 'genome_name', 'gtf', 'fasta', 'ribosomal_fasta',
        'ribosomal_index', 'star_index'
    ]
    utils.check_keys_exist(config, required_keys)

    # check that the required files are present
    files = [config['gtf'], config['fasta'], config['ribosomal_fasta']]

    if 'de_novo_gtf' in config:
        files.append(config['de_novo_gtf'])

    utils.check_files_exist(files, source='prepare-rpbp-genome')

    # now, check if we want to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # the rrna index
    cmd = "bowtie2-build-s {} {}".format(config['ribosomal_fasta'],
                                         config['ribosomal_index'])

    in_files = [config['ribosomal_fasta']]
    out_files = bio.get_bowtie2_index_files(config['ribosomal_index'])
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)

    # the STAR index
    mem = utils.human2bytes(args.mem)
    cmd = ("{} --runMode genomeGenerate --genomeDir {} --genomeFastaFiles {} "
           "--runThreadN {} --limitGenomeGenerateRAM {}".format(
               args.star_executable, config['star_index'], config['fasta'],
               args.num_cpus, mem))

    in_files = [config['fasta']]
    out_files = star_utils.get_star_index_files(config['star_index'])
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)

    # get the main orfs
    get_orfs(config['gtf'], args, config, is_annotated=True, is_de_novo=False)

    # eventually, we will use these names
    annotated_orfs = filenames.get_orfs(config['genome_base_path'],
                                        config['genome_name'],
                                        note=config.get('orf_note'),
                                        is_annotated=True,
                                        is_de_novo=False)

    annotated_exons_file = filenames.get_exons(config['genome_base_path'],
                                               config['genome_name'],
                                               note=config.get('orf_note'),
                                               is_annotated=True,
                                               is_de_novo=False)

    # the generic names, which the concatenated (or symlinked) files receive
    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    # now, check if we have a de novo assembly
    if 'de_novo_gtf' in config:
        get_orfs(config['de_novo_gtf'],
                 args,
                 config,
                 is_annotated=False,
                 is_de_novo=True)

        # we need to concat the ORF and exon files
        de_novo_orfs = filenames.get_orfs(config['genome_base_path'],
                                          config['genome_name'],
                                          note=config.get('orf_note'),
                                          is_annotated=False,
                                          is_de_novo=True)

        de_novo_exons_file = filenames.get_exons(config['genome_base_path'],
                                                 config['genome_name'],
                                                 note=config.get('orf_note'),
                                                 is_annotated=False,
                                                 is_de_novo=True)

        orfs_files = [annotated_orfs, de_novo_orfs]

        orfs_files_str = ' '.join(orfs_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            orfs_genomic, orfs_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(orfs_files, sort_bed=True)
            # renumber the ORFs over the concatenated file
            concatenated_bed['orf_num'] = range(len(concatenated_bed))
            fields = bed_utils.bed12_field_names + [
                'orf_num', 'orf_len', 'orf_type'
            ]
            bed_utils.write_bed(concatenated_bed[fields], orfs_genomic)
        else:
            # the flag which leads here is --do-not-call; there is no --call
            msg = "Skipping concatenation due to --do-not-call flag"
            logger.info(msg)

        exons_files = [annotated_exons_file, de_novo_exons_file]

        exons_files_str = ' '.join(exons_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            exons_file, exons_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(exons_files,
                                                     sort_bed=True)
            fields = bed_utils.bed6_field_names + [
                'exon_index', 'transcript_start'
            ]
            bed_utils.write_bed(concatenated_bed[fields], exons_file)
        else:
            msg = "Skipping concatenation due to --do-not-call flag"
            logger.info(msg)

    else:
        # finally, make sure our files are named correctly

        if os.path.exists(annotated_orfs):
            utils.create_symlink(annotated_orfs, orfs_genomic, call)

        if os.path.exists(annotated_exons_file):
            utils.create_symlink(annotated_exons_file, exons_file, call)
def main():
    """Run the Rp-Bp pipeline on a single sample.

    Parses the command line, loads the YAML config, and verifies that the
    required external programs and config keys are available. If
    ``--use-slurm`` was given, the original command line is handed to
    ``slurm.check_sbatch`` and the function returns. Otherwise it runs
    ``create-orf-profiles`` and then, unless ``--profiles-only`` was given,
    ``predict-translated-orfs``.

    NOTE(review): ``--num-cpus``, ``--mem``, ``--do-not-call`` and
    ``--use-slurm`` are not declared here; presumably they are registered by
    ``slurm.add_sbatch_options``, and ``--star-executable`` by
    ``star_utils.add_star_options`` -- confirm against those helpers.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script runs the Rp-Bp and Rp-chi pipelines on a given sample. "
        "It requires a YAML config file that includes a number of keys. Please see the "
        "documentation for a complete description.")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument(
        'name', help="The name for the dataset, used in the created files")

    parser.add_argument('--tmp',
                        help="The temp directory",
                        default=default_tmp)

    parser.add_argument(
        '--flexbar-format-option',
        help="The name of the \"format\" "
        "option for flexbar. This changed from \"format\" to \"qtrim-format\" in "
        "version 2.7.",
        default=default_flexbar_format_option)

    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files "
                        "will be overwritten.",
                        action='store_true')

    parser.add_argument('--profiles-only',
                        help="If this flag is present, then only "
                        "the ORF profiles will be created",
                        action='store_true')

    parser.add_argument(
        '-k',
        '--keep-intermediate-files',
        help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.",
        action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # pre-rendered option strings which are forwarded to the child scripts
    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    # NOTE(review): yaml.load can construct arbitrary Python objects and newer
    # PyYAML releases expect an explicit Loader; if the config can come from
    # an untrusted source, switch to yaml.safe_load.
    with open(args.config) as config_file:
        config = yaml.load(config_file)
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar', args.star_executable, 'samtools', 'bowtie2',
        'create-base-genome-profile', 'remove-multimapping-reads',
        'extract-metagene-profiles', 'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets', 'extract-orf-profiles',
        'estimate-orf-bayes-factors', 'select-final-prediction-set',
        'create-orf-profiles', 'predict-translated-orfs'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data', 'ribosomal_index', 'star_index', 'genome_base_path',
        'genome_name', 'fasta', 'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    # now, check if we want to use slurm
    msg = "use_slurm: {}".format(args.use_slurm)
    logger.debug(msg)

    if args.use_slurm:
        # hand the command line over as-is and stop here
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # handle do_not_call so that we _do_ call the child scripts,
    # but that they do not run anything
    do_not_call_str = "" if call else "--do-not-call"
    overwrite_str = "--overwrite" if args.overwrite else ""
    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    # User-supplied values are quoted (as --mem already was) so that paths
    # or names containing spaces or shell metacharacters survive intact.
    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(shlex.quote(args.tmp))

    flexbar_format_option_str = ""
    if args.flexbar_format_option is not None:
        flexbar_format_option_str = "--flexbar-format-option {}".format(
            shlex.quote(args.flexbar_format_option))

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    raw_data = shlex.quote(args.raw_data)
    config_path = shlex.quote(args.config)
    name = shlex.quote(args.name)

    # the first step is the standard riboseq preprocessing, which creates
    # the filtered genome profile for the sample
    cmd = ("create-orf-profiles {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}"
           .format(raw_data, config_path, name, args.num_cpus,
                   mem_str, do_not_call_str, overwrite_str, logging_str,
                   star_str, tmp_str, flexbar_format_option_str,
                   keep_intermediate_str))

    shell_utils.check_call(cmd)

    # check if we only want to create the profiles
    if args.profiles_only:
        return

    # then we predict the ORFs
    cmd = ("predict-translated-orfs {} {} --num-cpus {} {} {} {}".format(
        config_path, name, args.num_cpus, do_not_call_str, overwrite_str,
        logging_str))
    shell_utils.check_call(cmd)
Example #5
0
def main():
    """Run (or submit to SLURM) the Rp-Bp pipeline for every sample in a config.

    Parses the command line, loads the YAML config, and verifies that the
    required external programs and config keys are available. It then issues
    one ``run-rpbp-pipeline`` command per entry of ``riboseq_samples``
    through ``slurm.check_sbatch``. If ``--merge-replicates`` was given, it
    also issues one ``predict-translated-orfs --merge-replicates`` command
    per condition, passing the collected per-sample job ids as dependencies.

    NOTE(review): ``--num-cpus``, ``--mem`` and ``--do-not-call`` are not
    declared here; presumably they are registered by
    ``slurm.add_sbatch_options``, and ``--star-executable`` by
    ``star_utils.add_star_options`` -- confirm against those helpers.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This is a helper script which submits a set of samples to SLURM. It "
        "can also be used to run a set of samples sequentially. Due to limitations on "
        "the config file specification, all of the samples must use the same reference "
        "indices (i.e., genome sequence, set of ORFs, etc.).")

    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--tmp',
                        help="The temp directory",
                        default=default_tmp)

    parser.add_argument(
        '--flexbar-format-option',
        help="The name of the \"format\" "
        "option for flexbar. This changed from \"format\" to \"qtrim-format\" in "
        "version 2.7.",
        default=default_flexbar_format_option)

    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files "
                        "will be overwritten.",
                        action='store_true')

    parser.add_argument(
        '--merge-replicates',
        help="If this flag is present, then "
        "the ORF profiles from the replicates will be merged before making the final "
        "predictions",
        action='store_true')

    parser.add_argument(
        '--run-replicates',
        help="If this flag is given with the "
        "--merge-replicates flag, then both the replicates *and* the individual "
        "samples will be run. This flag has no effect if --merge-replicates is not "
        "given.",
        action='store_true')

    parser.add_argument(
        '-k',
        '--keep-intermediate-files',
        help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.",
        action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # pre-rendered option strings which are forwarded to the child scripts
    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    # NOTE(review): yaml.load can construct arbitrary Python objects and newer
    # PyYAML releases expect an explicit Loader; if the config can come from
    # an untrusted source, switch to yaml.safe_load.
    with open(args.config) as config_file:
        config = yaml.load(config_file)
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar', args.star_executable, 'samtools', 'bowtie2',
        'create-base-genome-profile', 'remove-multimapping-reads',
        'extract-metagene-profiles', 'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets', 'extract-orf-profiles',
        'estimate-orf-bayes-factors', 'select-final-prediction-set',
        'create-orf-profiles', 'predict-translated-orfs', 'run-rpbp-pipeline'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data', 'riboseq_samples', 'ribosomal_index', 'star_index',
        'genome_base_path', 'genome_name', 'fasta', 'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)

    # handle do_not_call so that we _do_ call the pipeline script, but that it
    # does not run anything: the flag is forwarded to the child command and
    # then cleared on args, which is what gets passed to slurm.check_sbatch
    do_not_call_str = "" if call else "--do-not-call"
    args.do_not_call = False

    overwrite_str = "--overwrite" if args.overwrite else ""

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    # if we merge the replicates, then we only use the rpbp script to create
    # the ORF profiles
    profiles_only_str = ""
    if args.merge_replicates and not args.run_replicates:
        profiles_only_str = "--profiles-only"

    # --run-replicates only has an effect together with --merge-replicates
    if args.run_replicates and not args.merge_replicates:
        msg = (
            "The --run-replicates option was given without the "
            "--merge-replicates option. It will be ignored.")
        logger.warning(msg)

    # User-supplied values are quoted (as --mem already was) so that paths
    # or names containing spaces or shell metacharacters survive intact.
    flexbar_format_option_str = ""
    if args.flexbar_format_option is not None:
        flexbar_format_option_str = "--flexbar-format-option {}".format(
            shlex.quote(args.flexbar_format_option))

    config_path = shlex.quote(args.config)

    # collect the job_ids in case we are using slurm and need to merge replicates
    job_ids = []

    sample_names = sorted(config['riboseq_samples'].keys())

    for sample_name in sample_names:
        data = config['riboseq_samples'][sample_name]

        # each sample gets its own temp directory below --tmp; note that the
        # directory name contains the literal "None" when the config has no
        # 'note' key
        tmp_str = ""
        if args.tmp is not None:
            tmp = os.path.join(args.tmp,
                               "{}_{}_rpbp".format(sample_name, note))
            tmp_str = "--tmp {}".format(shlex.quote(tmp))

        # values read from the YAML file are not guaranteed to be strings
        cmd = "run-rpbp-pipeline {} {} {} --num-cpus {} {} {} {} {} {} {} {} {} {}".format(
            shlex.quote(str(data)), config_path,
            shlex.quote(str(sample_name)), args.num_cpus, tmp_str,
            do_not_call_str, overwrite_str, logging_str, star_str,
            profiles_only_str, flexbar_format_option_str,
            keep_intermediate_str, mem_str)

        job_id = slurm.check_sbatch(cmd, args=args)

        job_ids.append(job_id)

    # now, if we are running the "standard" pipeline, we are finished
    if not args.merge_replicates:
        return

    # otherwise, we need to merge the replicates for each condition
    riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
    merge_replicates_str = "--merge-replicates"

    for condition_name in sorted(riboseq_replicates.keys()):

        # then we predict the ORFs
        cmd = "predict-translated-orfs {} {} --num-cpus {} {} {} {} {}".format(
            config_path, shlex.quote(str(condition_name)), args.num_cpus,
            do_not_call_str, overwrite_str, logging_str, merge_replicates_str)

        slurm.check_sbatch(cmd, args=args, dependencies=job_ids)