def create_read_filtering_plots(config_file, config, args):
    """ Collect the read filtering counts and draw the two summary plots.

    Three commands are handed to shell_utils.call_if_not_exists, each with
    call=True and the overwrite policy taken from args.overwrite:

      1. get-all-read-filtering-counts, whose expected output is the counts
         file
      2. visualize-read-filtering-counts, which takes the counts file as
         input and the image file as output
      3. the same visualization with --without-rrna, stored under a
         "no-rrna-<note>" file name

    Args:
        config_file: path of the config file; it is passed to the counting
            command and listed as that command's input file
        config: the loaded configuration; 'riboseq_data' is required and
            'note' is optional
        args: the parsed command line options; overwrite, num_cpus and
            image_type are read directly, and the logging options are
            extracted by logging_utils.get_logging_options_string

    Returns:
        None
    """
    note = config.get('note', None)
    data_dir = config['riboseq_data']

    def run(command, inputs, outputs):
        # all three steps share the same overwrite and call settings
        shell_utils.call_if_not_exists(command, outputs, in_files=inputs,
            overwrite=args.overwrite, call=True)

    # step 1: the filtering counts themselves
    counts_file = filenames.get_riboseq_read_filtering_counts(
        data_dir, note=note)

    overwrite_flag = "--overwrite" if args.overwrite else ""
    logging_flags = logging_utils.get_logging_options_string(args)
    cpus_flag = "--num-cpus {}".format(args.num_cpus)

    count_cmd = "get-all-read-filtering-counts {} {} {} {} {}".format(
        config_file, counts_file, overwrite_flag, cpus_flag, logging_flags)
    run(count_cmd, [config_file], [counts_file])

    # steps 2 and 3: (note used in the image file name, plot title, extra
    # command line flags). The second entry leaves out the rrna matches.
    plot_variants = [
        (note, "Read filtering counts", ""),
        (
            "no-rrna-{}".format(note),
            "Read filtering counts, no ribosomal matches",
            " --without-rrna"
        ),
    ]

    for image_note, title, extra_flags in plot_variants:
        image_file = filenames.get_riboseq_read_filtering_counts_image(
            data_dir, note=image_note, image_type=args.image_type)

        # the title contains spaces, so it has to be quoted for the shell
        title_flag = "--title {}".format(shlex.quote(title))
        plot_cmd = "visualize-read-filtering-counts {} {} {}{}".format(
            counts_file, image_file, title_flag, extra_flags)
        run(plot_cmd, [counts_file], [image_file])
def create_read_filtering_plots(config_file, config, args):
    """ Produce the read filtering counts file and its two visualizations.

    The counts are gathered by get-all-read-filtering-counts and then
    plotted twice by visualize-read-filtering-counts: once as they are and
    once with the --without-rrna flag. Every command goes through
    shell_utils.call_if_not_exists with call=True.

    Args:
        config_file: path of the config file, forwarded to the counting
            command and used as its input file
        config: the loaded configuration; reads the required
            'riboseq_data' entry and the optional 'note' entry
        args: the parsed command line options (overwrite, num_cpus,
            image_type and the logging options are used)

    Returns:
        None
    """
    note = config.get('note', None)
    riboseq_data = config['riboseq_data']

    def execute(command, inputs, outputs):
        # shared settings for every command issued by this function
        shell_utils.call_if_not_exists(command,
                                       outputs,
                                       in_files=inputs,
                                       overwrite=args.overwrite,
                                       call=True)

    def plot_counts(counts, image_note, title, extra_flags=""):
        # image_note only affects the name of the created image file
        image = filenames.get_riboseq_read_filtering_counts_image(
            riboseq_data, note=image_note, image_type=args.image_type)

        quoted_title = shlex.quote(title)
        command = "visualize-read-filtering-counts {} {} {}{}".format(
            counts, image, "--title {}".format(quoted_title), extra_flags)
        execute(command, [counts], [image])

    # get the filtering counts
    counts = filenames.get_riboseq_read_filtering_counts(riboseq_data,
                                                         note=note)

    options = [
        "--overwrite" if args.overwrite else "",
        "--num-cpus {}".format(args.num_cpus),
        logging_utils.get_logging_options_string(args),
    ]
    command = "get-all-read-filtering-counts {} {} {} {} {}".format(
        config_file, counts, *options)
    execute(command, [config_file], [counts])

    # visualize everything, then once more without the rrna matches
    plot_counts(counts, note, "Read filtering counts")
    plot_counts(counts,
                "no-rrna-{}".format(note),
                "Read filtering counts, no ribosomal matches",
                extra_flags=" --without-rrna")
def create_figures(config_file, config, name, offsets_df, args):
    """ Build the preprocessing report figures for a single dataset.

    The following commands are handed to shell_utils.call_if_not_exists, in
    this order:

      * get-read-length-distribution for the two bam files of the sample
      * plot-read-length-distribution, once titled "All aligned reads" and
        once titled "Uniquely aligned reads"
      * create-read-length-metagene-profile-plot for every read length
        between the smallest and largest value of offsets_df['length'];
        lengths without a row, or whose first row has a
        'highest_peak_profile_sum' below args.min_visualization_count, are
        skipped
      * visualize-metagene-profile-bayes-factor for those same lengths,
        only when args.show_read_length_bfs is set
      * visualize-orf-type-metagene-profiles, only when
        args.show_orf_periodicity is set

    Args:
        config_file: not referenced in the body of this function
        config: the loaded configuration; 'riboseq_data' is always read,
            'genome_base_path' and 'genome_name' are read for the ORF-type
            plots, and 'note', 'orf_note', 'riboseq_sample_name_map' and
            'keep_riboseq_multimappers' are optional
        name: the name of the dataset
        offsets_df: a data frame with (at least) the columns 'length' and
            'highest_peak_profile_sum'
        args: the parsed command line options

    Returns:
        None. The function also returns early, after logging an error, if
        the periodic lengths and offsets cannot be found for the ORF-type
        plots.
    """
    logging_str = logging_utils.get_logging_options_string(args)
    note = config.get('note', None)
    note_str = filenames.get_note_string(note)

    # the presence of the key alone decides whether multimappers are kept
    is_unique = 'keep_riboseq_multimappers' not in config

    image_type_str = "--image-type {}".format(args.image_type)

    read_lengths = offsets_df['length']
    min_read_length = int(read_lengths.min())
    max_read_length = int(read_lengths.max())

    min_read_length_str = "--min-read-length {}".format(min_read_length)
    max_read_length_str = "--max-read-length {}".format(max_read_length)

    def run(command, inputs, outputs):
        # shared settings for all commands which pass call=True
        shell_utils.call_if_not_exists(command, outputs, in_files=inputs,
            overwrite=args.overwrite, call=True)

    logger.info(
        "{}: Getting and visualizing read length distribution".format(name))

    riboseq_data = config['riboseq_data']

    # all aligned reads, and the (possibly) uniquely aligned ones
    genome_bam = filenames.get_riboseq_bam(riboseq_data, name, note=note)
    unique_bam = filenames.get_riboseq_bam(
        riboseq_data, name, is_unique=is_unique, note=note)

    # the read length counts
    length_distribution = filenames.get_riboseq_read_length_distribution(
        riboseq_data, name, note=note)

    cmd = "get-read-length-distribution {} {} --out {} {}".format(
        genome_bam, unique_bam, length_distribution, logging_str)
    run(cmd, [genome_bam, unique_bam], [length_distribution])

    # prefer the display name from the config, if there is one
    if 'riboseq_sample_name_map' in config:
        title = config['riboseq_sample_name_map'].get(name)
    else:
        title = None

    if title is None:
        title = "{}{}".format(name, note_str)

    # (title suffix, uniqueness used for the image file name, uniqueness
    # used for the sample name handed to the plotting command)
    distribution_plots = [
        ("All aligned reads", False, False),
        ("Uniquely aligned reads", is_unique, True),
    ]

    for label, image_is_unique, sample_is_unique in distribution_plots:
        plot_title = "{}, {}".format(title, label)
        title_str = "--title={}".format(shlex.quote(plot_title))

        sample_name = "{}{}{}".format(
            name, note_str, filenames.get_unique_string(sample_is_unique))

        image = filenames.get_riboseq_read_length_distribution_image(
            riboseq_data,
            name,
            is_unique=image_is_unique,
            note=note,
            image_type=args.image_type
        )

        cmd = "plot-read-length-distribution {} {} {} {} {} {}".format(
            length_distribution,
            sample_name,
            image,
            title_str,
            min_read_length_str,
            max_read_length_str
        )
        run(cmd, [length_distribution], [image])

    # visualize the metagene profiles
    logger.info(
        "{}: Visualizing metagene profiles and Bayes' factors".format(name))

    metagene_profiles = filenames.get_metagene_profiles(
        riboseq_data, name, is_unique=is_unique, note=note)

    profile_bayes_factor = filenames.get_metagene_profiles_bayes_factors(
        riboseq_data, name, is_unique=is_unique, note=note)

    # the resulting frame is not used below; the read is kept as it was
    mp_df = pd.read_csv(metagene_profiles)

    for length in range(min_read_length, max_read_length + 1):

        has_length = read_lengths == length

        # make sure we had some reads of that length
        if not has_length.any():
            continue

        # make sure we have enough reads to visualize
        first_row = offsets_df[has_length].iloc[0]
        peak_sum = first_row['highest_peak_profile_sum']
        if peak_sum < args.min_visualization_count:
            continue

        # both images for this length are named from the same properties
        image_options = dict(
            image_type=args.image_type,
            is_unique=is_unique,
            length=length,
            note=note
        )

        # the metagene profile
        profile_image = filenames.get_metagene_profile_image(
            riboseq_data, name, **image_options)

        plot_title = "{}. length: {}".format(title, length)
        title_str = "--title {}".format(shlex.quote(plot_title))
        cmd = "create-read-length-metagene-profile-plot {} {} {} {}".format(
            metagene_profiles, length, profile_image, title_str)
        run(cmd, [metagene_profiles], [profile_image])

        if not args.show_read_length_bfs:
            continue

        # and the Bayes' factor
        bf_image = filenames.get_metagene_profile_bayes_factor_image(
            riboseq_data, name, **image_options)

        plot_title = "Metagene profile Bayes' factors: {}. length: {}".format(
            title, length)
        title_str = "--title {}".format(shlex.quote(plot_title))

        cmd = "visualize-metagene-profile-bayes-factor {} {} {} {} {}".format(
            profile_bayes_factor,
            length,
            bf_image,
            title_str,
            "--font-size 15"
        )
        run(cmd, [profile_bayes_factor], [bf_image])

    # the orf-type metagene profiles are optional
    if not args.show_orf_periodicity:
        return

    logger.info("{}: Visualizing the ORF type metagene profiles".format(title))

    try:
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config, name, is_unique=is_unique)
    except FileNotFoundError:
        logger.error("Could not parse out lengths and offsets for sample: "
            "{}. Skipping".format(name))
        return

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
        config['genome_name'], note=config.get('orf_note'))

    # the profiles and the image base are named from the same properties
    profile_options = dict(
        length=lengths,
        offset=offsets,
        is_unique=is_unique,
        note=note
    )

    profiles = filenames.get_riboseq_profiles(
        riboseq_data, name, **profile_options)

    plot_title = "{}, ORF-type periodicity".format(title)
    title_str = "--title {}".format(shlex.quote(plot_title))

    orf_type_profile_base = filenames.get_orf_type_profile_base(
        riboseq_data, name, subfolder='orf-profiles', **profile_options)

    # one image per ORF type: all forward strand images come first,
    # followed by all reverse strand images
    orf_type_images = [
        filenames.get_orf_type_profile_image(
            orf_type_profile_base, orf_type, strand, args.image_type)
        for strand in ("+", "-")
        for orf_type in ribo_utils.orf_types
    ]

    cmd = "visualize-orf-type-metagene-profiles {} {} {} {} {} {}".format(
        orfs_genomic,
        profiles,
        orf_type_profile_base,
        title_str,
        image_type_str,
        logging_str
    )

    # unlike the calls above, no explicit "call" argument is given here
    shell_utils.call_if_not_exists(cmd, orf_type_images,
        in_files=[orfs_genomic, profiles], overwrite=args.overwrite)
def main():
    """ Command line entry point for the second part of the pipeline.

    The function parses the command line, loads the yaml config file and
    then issues these commands through shell_utils.call_if_not_exists:

      * merge-replicate-orf-profiles, only when --merge-replicates is given
        (which also forces --overwrite)
      * estimate-orf-bayes-factors
      * select-final-prediction-set, once with the
        "--select-longest-by-stop --select-best-overlapping" filters and
        once without them

    With --do-not-call, call=False is passed to call_if_not_exists for
    every command.

    The config file must contain the keys 'riboseq_data', 'fasta',
    'genome_base_path' and 'genome_name'.

    Returns:
        None
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script runs the second part of the pipeline: it estimates ORF Bayes "
            "factors using the ORF profiles, then makes the final prediction set.")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
        type=int, default=default_num_cpus)

    parser.add_argument('--do-not-call', action='store_true')
    parser.add_argument('--overwrite', help="If this flag is present, existing files "
        "will be overwritten.", action='store_true')

    parser.add_argument('--merge-replicates', help="If this flag is present, then the ORF "
        "profiles will be merged for all replicates in the condition given by <name>. The "
        "filenames, etc., will reflect the condition name, but not the lengths and offsets "
        "of the individual replicates.\n\nN.B. If this flag is present, the --overwrite "
        "flag will automatically be set!", action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[predict_translated_orfs]: {}".format(' '.join(sys.argv))
    logger.debug(msg)

    logging_str = logging_utils.get_logging_options_string(args)

    # yaml.load without an explicit Loader is rejected by recent PyYAML
    # releases, and the bare open() never closed the file. The safe loader
    # only builds plain yaml types.
    with open(args.config) as config_handle:
        config = yaml.safe_load(config_handle)

    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'estimate-orf-bayes-factors',
        'select-final-prediction-set'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'fasta',
        'genome_base_path',
        'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    models_base = config.get('models_base', default_models_base)

    note_str = config.get('note', None)

    # we always need the ORFs
    orfs_genomic = filenames.get_orfs(
        config['genome_base_path'],
        config['genome_name'],
        note=config.get('orf_note')
    )

    # smoothing parameters (filenames)
    # default values are not used in the file names
    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations', None)

    # check if we are running Rp-Bp (default) or Rp-chi
    chi_square_only_str = ""
    chi_square_only = False
    if 'chi_square_only' in config:
        chi_square_only_str = "--chi-square-only"
        chi_square_only = True

        # the smoothing parameters do not appear in the Rp-chi file names
        fraction = None
        reweighting_iterations = None
        msg = """ The final prediction set will be made based on the chi square test only! 
                  The translation models will not be fit to the data, and the posterior 
                  distributions will not be estimated. """
        logger.info(msg)

    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    # first, check if we are merging replicates

    # either way, the following variables need to have values for the rest of
    # the pipeline: lengths, offsets, profiles
    if args.merge_replicates:
        msg = ("The --merge-replicates option was given, so --overwrite is "
            "being set to True.")
        logger.warning(msg)
        args.overwrite = True

        # now, actually merge the replicates
        riboseq_replicates = ribo_utils.get_riboseq_replicates(config)

        # we will not use the lengths and offsets in the filenames
        lengths = None
        offsets = None

        # we will also merge all of unsmoothed profiles
        replicate_profiles = [
            get_profile(name, config, args)
                for name in riboseq_replicates[args.name]
        ]

        replicate_profiles_str = ' '.join(replicate_profiles)

        profiles = filenames.get_riboseq_profiles(config['riboseq_data'], args.name,
            length=lengths, offset=offsets, is_unique=is_unique, note=note_str)

        cmd = "merge-replicate-orf-profiles {} {} {}".format(replicate_profiles_str,
            profiles, logging_str)
        in_files = replicate_profiles
        out_files = [profiles]

        # todo: implement file checker for mtx files
        shell_utils.call_if_not_exists(
            cmd,
            out_files,
            in_files=in_files,
            overwrite=args.overwrite,
            call=call
        )

    else:
        # otherwise, just treat things as normal
        # get the lengths and offsets which meet the required criteria from
        # the config file
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(config,
            args.name, args.do_not_call, is_unique=is_unique)

        profiles = get_profile(args.name, config, args)

    # estimate the bayes factors
    bayes_factors = filenames.get_riboseq_bayes_factors(
        config['riboseq_data'],
        args.name,
        length=lengths,
        offset=offsets,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations
    )

    # the smoothing options
    min_length_str = utils.get_config_argument(config, 'min_orf_length', 'min-length')
    max_length_str = utils.get_config_argument(config, 'max_orf_length', 'max-length')
    min_profile_str = utils.get_config_argument(config, 'min_signal', 'min-profile')

    fraction_str = utils.get_config_argument(config, 'smoothing_fraction', 'fraction')
    reweighting_iterations_str = utils.get_config_argument(config,
        'smoothing_reweighting_iterations', 'reweighting-iterations')

    # parse out all of the options from the config file, if they are present
    translated_models = filenames.get_models(models_base, 'translated')
    untranslated_models = filenames.get_models(models_base, 'untranslated')

    translated_models_str = ' '.join(translated_models)
    untranslated_models_str = ' '.join(untranslated_models)

    translated_models_str = "--translated-models {}".format(
        translated_models_str)
    untranslated_models_str = "--untranslated-models {}".format(
        untranslated_models_str)

    orf_types_str = utils.get_config_argument(config, 'orf_types')

    seed_str = utils.get_config_argument(config, 'seed')
    chains_str = utils.get_config_argument(config, 'chains', 'chains')
    iterations_str = utils.get_config_argument(config, 'translation_iterations', 'iterations')

    cmd = ("estimate-orf-bayes-factors {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} "
        "--num-cpus {}".format(
        profiles,
        orfs_genomic,
        bayes_factors,
        translated_models_str,
        untranslated_models_str,
        logging_str,
        orf_types_str,
        min_length_str,
        max_length_str,
        min_profile_str,
        fraction_str,
        reweighting_iterations_str,
        seed_str,
        iterations_str,
        chains_str,
        chi_square_only_str,
        args.num_cpus)
    )

    in_files = [profiles, orfs_genomic]
    in_files.extend(translated_models)
    in_files.extend(untranslated_models)
    out_files = [bayes_factors]
    file_checkers = {
        bayes_factors: utils.check_gzip_file
    }
    msg = "estimate-bayes-factors in_files: {}".format(in_files)
    logger.debug(msg)
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
        file_checkers=file_checkers, overwrite=args.overwrite, call=call)

    # the selection thresholds are the same for the filtered and the
    # unfiltered prediction sets, so they are built once
    min_bf_mean_str = utils.get_config_argument(config, 'min_bf_mean')
    max_bf_var_str = utils.get_config_argument(config, 'max_bf_var')
    min_bf_likelihood_str = utils.get_config_argument(config, 'min_bf_likelihood')

    # NOTE(review): the next two option strings are computed, but they are
    # not part of the select-final-prediction-set command built below.
    # Confirm whether they should be passed to it or can be removed.
    chisq_significance_level_str = utils.get_config_argument(config, 'chisq_significance_level')
    min_profile_str = utils.get_config_argument(config, 'min_signal', 'minimum-profile-sum')

    for is_filtered in [True, False]:

        filtered_str = ""
        if is_filtered:
            filtered_str = "--select-longest-by-stop --select-best-overlapping"

        # the three output files are named from the same set of properties
        prediction_options = dict(
            length=lengths,
            offset=offsets,
            is_unique=is_unique,
            note=note_str,
            fraction=fraction,
            reweighting_iterations=reweighting_iterations,
            is_filtered=is_filtered,
            is_chisq=chi_square_only
        )

        # now, select the ORFs (longest for each stop codon) which pass the prediction filters
        predicted_orfs = filenames.get_riboseq_predicted_orfs(
            config['riboseq_data'], args.name, **prediction_options)

        predicted_orfs_dna = filenames.get_riboseq_predicted_orfs_dna(
            config['riboseq_data'], args.name, **prediction_options)

        predicted_orfs_protein = filenames.get_riboseq_predicted_orfs_protein(
            config['riboseq_data'], args.name, **prediction_options)

        cmd = "select-final-prediction-set {} {} {} {} {} {} {} {} {} {} {}".format(
            bayes_factors,
            config['fasta'],
            predicted_orfs,
            predicted_orfs_dna,
            predicted_orfs_protein,
            min_bf_mean_str,
            max_bf_var_str,
            min_bf_likelihood_str,
            logging_str,
            chi_square_only_str,
            filtered_str
        )

        in_files = [bayes_factors, config['fasta']]
        out_files = [
            predicted_orfs,
            predicted_orfs_dna,
            predicted_orfs_protein
        ]

        file_checkers = {
            predicted_orfs: utils.check_gzip_file
        }

        # todo: implement file checker for fasta files
        shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
            file_checkers=file_checkers, overwrite=args.overwrite, call=call)
def get_orfs(gtf, args, config, is_annotated=False, is_de_novo=False):
    """ Turn a GTF file into the labeled ORF files derived from it.

    The commands below are handed to shell_utils.call_if_not_exists in this
    order; with args.do_not_call set, call=False is passed for all of them:

      1. gtf-to-bed12: the GTF file to a bed12 file of the transcripts
      2. split-bed12-blocks: the transcripts to their exons
      3. extract-bed-sequences: the transcript sequences as fasta
      4. extract-orf-coordinates: the ORFs of the transcripts
      5. split-bed12-blocks: the ORFs to their exons
      6. label-orfs: label the ORFs with respect to the annotated
         transcripts; the labeled ORFs are written to the same path as the
         ORFs from step 4

    Args:
        gtf: path of the GTF file
        args: the parsed command line options; do_not_call, overwrite,
            num_cpus and the logging options are used
        config: the loaded configuration; reads 'star_index',
            'genome_base_path', 'genome_name' and 'fasta', plus the
            optional 'orf_note', 'start_codons' and 'stop_codons'
        is_annotated: passed on to the filenames helpers
        is_de_novo: passed on to the filenames helpers; when it is set,
            the extra "novel" options are added to the label-orfs command

    Returns:
        None
    """
    call = not args.do_not_call

    def run(command, inputs, outputs):
        # every step shares the overwrite and call settings
        shell_utils.call_if_not_exists(command, outputs, in_files=inputs,
            overwrite=args.overwrite, call=call)

    chr_name_str = "--chr-name-file {}".format(
        os.path.join(config['star_index'], 'chrName.txt'))

    logging_str = logging_utils.get_logging_options_string(args)
    cpus_str = "--num-cpus {}".format(args.num_cpus)

    genome_base_path = config['genome_base_path']
    genome_name = config['genome_name']

    # these two flags select the file names for (almost) all of the files
    source = dict(is_annotated=is_annotated, is_de_novo=is_de_novo)

    # 1. a bed12 file of the transcripts
    transcript_bed = filenames.get_bed(
        genome_base_path, genome_name, is_merged=False, **source)

    run(
        "gtf-to-bed12 {} {} {} {} {}".format(
            gtf, transcript_bed, chr_name_str, cpus_str, logging_str),
        [gtf],
        [transcript_bed]
    )

    # 2. the exons of the transcripts
    transcript_exons = filenames.get_exons(
        genome_base_path, genome_name, **source)

    run(
        "split-bed12-blocks {} {} {} {}".format(
            transcript_bed, transcript_exons, cpus_str, logging_str),
        [transcript_bed],
        [transcript_exons]
    )

    # 3. the transcript fasta
    transcript_fasta = filenames.get_transcript_fasta(
        genome_base_path, genome_name, **source)

    run(
        "extract-bed-sequences {} {} {} {}".format(
            transcript_bed, config['fasta'], transcript_fasta, logging_str),
        [transcript_bed, config['fasta']],
        [transcript_fasta]
    )

    # 4. the coordinates of the orfs
    orfs_genomic = filenames.get_orfs(genome_base_path, genome_name,
        note=config.get('orf_note'), **source)

    start_codons_str = utils.get_config_argument(config, 'start_codons')
    stop_codons_str = utils.get_config_argument(config, 'stop_codons')

    run(
        "extract-orf-coordinates {} {} {} {} {} {} {}".format(
            transcript_bed,
            transcript_fasta,
            orfs_genomic,
            cpus_str,
            start_codons_str,
            stop_codons_str,
            logging_str
        ),
        [transcript_fasta, transcript_bed],
        [orfs_genomic]
    )

    # 5. the exons of the orfs
    orf_exons = filenames.get_exons(genome_base_path, genome_name,
        note=config.get('orf_note'), is_orf=True, **source)

    run(
        "split-bed12-blocks {} {} {} {}".format(
            orfs_genomic, orf_exons, cpus_str, logging_str),
        [orfs_genomic],
        [orf_exons]
    )

    # 6. label the orfs; there is no need to keep the unlabeled ones
    # around, so the labeled orfs reuse their file name
    labeled_orfs = orfs_genomic

    # we always label wrt the annotated annotations
    annotated_bed = filenames.get_bed(
        genome_base_path, genome_name, is_merged=False, is_annotated=True)

    if is_de_novo:
        de_novo_str = '--label-prefix "novel_" --filter --nonoverlapping-label "novel"'
    else:
        de_novo_str = ""

    label_cmd = "label-orfs {} {} {} {} {} {} {}".format(
        annotated_bed,
        orfs_genomic,
        orf_exons,
        labeled_orfs,
        cpus_str,
        de_novo_str,
        logging_str
    )

    # the output file already exists (it is the file from step 4), so no
    # output files are given for this step
    run(label_cmd, [annotated_bed, orfs_genomic, orf_exons], None)
def create_figures(config_file, config, name, offsets_df, args):
    """ Create all of the figures in the preprocessing report for one dataset.

    Every figure is produced by building the command line of an external
    script and handing it to shell_utils.call_if_not_exists together with the
    files that command reads and writes.

    Args:
        config_file: the path to the config file. It is not used in this
            function.

        config (dict-like): the loaded configuration. 'riboseq_data' is
            always read. 'note', 'keep_riboseq_multimappers' and
            'riboseq_sample_name_map' are optional. 'genome_base_path',
            'genome_name' and (optionally) 'orf_note' are read only when
            args.show_orf_periodicity is set.

        name (str): the name of the dataset

        offsets_df: a data frame with (at least) the columns 'length' and
            'highest_peak_profile_sum'

        args: the parsed command line options. The attributes read here are
            image_type, overwrite, min_visualization_count,
            show_read_length_bfs and show_orf_periodicity. It is also passed
            to logging_utils.get_logging_options_string.

    Returns:
        None
    """
    logging_str = logging_utils.get_logging_options_string(args)
    note = config.get('note', None)

    note_str = filenames.get_note_string(note)

    # keep multimappers?
    is_unique = 'keep_riboseq_multimappers' not in config

    image_type_str = "--image-type {}".format(args.image_type)

    min_read_length = int(offsets_df['length'].min())
    max_read_length = int(offsets_df['length'].max())

    min_read_length_str = "--min-read-length {}".format(min_read_length)
    max_read_length_str = "--max-read-length {}".format(max_read_length)

    msg = "{}: Getting and visualizing read length distribution".format(name)
    logger.info(msg)

    # all aligned reads
    genome_bam = filenames.get_riboseq_bam(config['riboseq_data'],
                                           name,
                                           note=note)

    # uniquely aligned reads
    unique_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                name,
                                                is_unique=is_unique,
                                                note=note)

    # the read length counts
    read_length_distribution = filenames.get_riboseq_read_length_distribution(
        config['riboseq_data'], name, note=note)

    # the plots
    cmd = "get-read-length-distribution {} {} --out {} {}".format(
        genome_bam, unique_filename, read_length_distribution, logging_str)
    in_files = [genome_bam, unique_filename]
    out_files = [read_length_distribution]
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=True)

    # visualize all read counts

    # prefer the pretty name from the sample name map, if there is one
    title = None
    if 'riboseq_sample_name_map' in config:
        title = config['riboseq_sample_name_map'].get(name)
    if title is None:
        title = "{}{}".format(name, note_str)

    title_str = "{}, All aligned reads".format(title)
    title_str = "--title={}".format(shlex.quote(title_str))

    # get the basename for the distribution file
    unique_str = filenames.get_unique_string(False)
    sample_name = "{}{}{}".format(name, note_str, unique_str)

    read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
        config['riboseq_data'],
        name,
        is_unique=False,
        note=note,
        image_type=args.image_type)

    cmd = "plot-read-length-distribution {} {} {} {} {} {}".format(
        read_length_distribution, sample_name, read_length_distribution_image,
        title_str, min_read_length_str, max_read_length_str)

    in_files = [read_length_distribution]
    out_files = [read_length_distribution_image]
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=True)

    # visualize unique read counts

    # we already have the title
    title_str = "{}, Uniquely aligned reads".format(title)
    title_str = "--title={}".format(shlex.quote(title_str))

    unique_read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
        config['riboseq_data'],
        name,
        is_unique=is_unique,
        note=note,
        image_type=args.image_type)

    # get the basename for the distribution file
    unique_str = filenames.get_unique_string(True)
    sample_name = "{}{}{}".format(name, note_str, unique_str)

    cmd = "plot-read-length-distribution {} {} {} {} {} {}".format(
        read_length_distribution, sample_name,
        unique_read_length_distribution_image, title_str, min_read_length_str,
        max_read_length_str)
    in_files = [read_length_distribution]
    out_files = [unique_read_length_distribution_image]
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=True)

    # visualize the metagene profiles
    msg = "{}: Visualizing metagene profiles and Bayes' factors".format(name)
    logger.info(msg)

    metagene_profiles = filenames.get_metagene_profiles(config['riboseq_data'],
                                                        name,
                                                        is_unique=is_unique,
                                                        note=note)

    profile_bayes_factor = filenames.get_metagene_profiles_bayes_factors(
        config['riboseq_data'], name, is_unique=is_unique, note=note)

    # NOTE(review): the loaded frame is not used anywhere below. The read is
    # kept so that a missing or unreadable profile file still fails at this
    # point, exactly as before; consider removing it.
    mp_df = pd.read_csv(metagene_profiles)

    for length in range(min_read_length, max_read_length + 1):

        mask_length = offsets_df['length'] == length

        # make sure we had some reads of that length
        if not mask_length.any():
            continue
        length_row = offsets_df[mask_length].iloc[0]

        # make sure we have enough reads to visualize
        if (length_row['highest_peak_profile_sum'] <
                args.min_visualization_count):
            continue

        # visualize the metagene profile
        metagene_profile_image = filenames.get_metagene_profile_image(
            config['riboseq_data'],
            name,
            image_type=args.image_type,
            is_unique=is_unique,
            length=length,
            note=note)

        title_str = "{}. length: {}".format(title, length)
        title_str = "--title {}".format(shlex.quote(title_str))
        cmd = ("create-read-length-metagene-profile-plot {} {} {} {}".format(
            metagene_profiles, length, metagene_profile_image, title_str))
        in_files = [metagene_profiles]
        out_files = [metagene_profile_image]
        shell_utils.call_if_not_exists(cmd,
                                       out_files,
                                       in_files=in_files,
                                       overwrite=args.overwrite,
                                       call=True)

        # and the Bayes' factor
        if args.show_read_length_bfs:
            bayes_factor_image = filenames.get_metagene_profile_bayes_factor_image(
                config['riboseq_data'],
                name,
                image_type=args.image_type,
                is_unique=is_unique,
                length=length,
                note=note)

            title_str = "Metagene profile Bayes' factors: {}. length: {}".format(
                title, length)
            # quote the full title built above; quoting the bare sample title
            # here would throw away the "Bayes' factors ... length" text
            title_str = "--title {}".format(shlex.quote(title_str))
            fontsize_str = "--font-size 25"

            cmd = ("visualize-metagene-profile-bayes-factor {} {} {} {} {}".
                   format(profile_bayes_factor, length, bayes_factor_image,
                          title_str, fontsize_str))

            in_files = [profile_bayes_factor]
            out_files = [bayes_factor_image]
            shell_utils.call_if_not_exists(cmd,
                                           out_files,
                                           in_files=in_files,
                                           overwrite=args.overwrite,
                                           call=True)

    # the orf-type metagene profiles
    if not args.show_orf_periodicity:
        return

    msg = "{}: Visualizing the ORF type metagene profiles".format(title)
    logger.info(msg)

    try:
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config, name, is_unique=is_unique)
    except FileNotFoundError:
        msg = ("Could not parse out lengths and offsets for sample: {}. "
               "Skipping".format(name))
        logger.error(msg)
        return

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    # The filename helpers take the raw note (as in every other call in this
    # function); note_str is already the formatted version and must not be
    # passed as "note".
    profiles = filenames.get_riboseq_profiles(config['riboseq_data'],
                                              name,
                                              length=lengths,
                                              offset=offsets,
                                              is_unique=is_unique,
                                              note=note)

    title_str = "{}, ORF-type periodicity".format(title)
    title_str = "--title {}".format(shlex.quote(title_str))

    orf_type_profile_base = filenames.get_orf_type_profile_base(
        config['riboseq_data'],
        name,
        length=lengths,
        offset=offsets,
        is_unique=is_unique,
        note=note,
        subfolder='orf-profiles')

    # one image is expected per ORF type and strand
    strand = "+"
    orf_type_profiles_forward = [
        filenames.get_orf_type_profile_image(orf_type_profile_base,
                                             orf_type, strand,
                                             args.image_type)
        for orf_type in ribo_utils.orf_types
    ]

    strand = "-"
    orf_type_profiles_reverse = [
        filenames.get_orf_type_profile_image(orf_type_profile_base,
                                             orf_type, strand,
                                             args.image_type)
        for orf_type in ribo_utils.orf_types
    ]

    cmd = ("visualize-orf-type-metagene-profiles {} {} {} {} {} {}".format(
        orfs_genomic, profiles, orf_type_profile_base, title_str,
        image_type_str, logging_str))

    in_files = [orfs_genomic, profiles]
    out_files = orf_type_profiles_forward + orf_type_profiles_reverse
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite)
Example #7
0
def main():
    """ Create the ORF profiles for a single riboseq sample.

    The command line is parsed, the config file is loaded, and then the
    following external scripts are called in order (each one through
    shell_utils.call_if_not_exists):

        create-base-genome-profile
        extract-metagene-profiles
        estimate-metagene-profile-bayes-factors
        select-periodic-offsets
        extract-orf-profiles

    If no periodic read lengths are found after select-periodic-offsets, a
    critical message is logged and the function returns without extracting
    the ORF profiles.

    Returns:
        None
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script runs all of the processing necessary to produce the "
        "signals used for later processing. In particular, it runs the standard "
        "rnaseq and riboseq preprocessing, estimates the abundance of transcripts with "
        "the rnaseq data, and selects the most-expressed isoforms and ORFs. Next, it removes "
        "multimapping and non-periodic-length riboseq reads. Finally, it extracts the riboseq "
        "signal for the most-expressed transcripts.")
    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    parser.add_argument('config', help="The (json) config file")
    parser.add_argument(
        'name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of processors to use",
                        type=int,
                        default=default_num_cpus)

    parser.add_argument('--mem',
                        help="The amount of RAM to request",
                        default=default_mem)

    parser.add_argument(
        '--flexbar-format-option',
        help="The name of the \"format\" "
        "option for flexbar. This changed from \"format\" to \"qtrim-format\" in "
        "version 2.7.",
        default=default_flexbar_format_option)

    parser.add_argument('--tmp',
                        help="The location for temp files",
                        default=default_tmp)

    parser.add_argument('--do-not-call', action='store_true')
    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files "
                        "will be overwritten.",
                        action='store_true')

    parser.add_argument(
        '-k',
        '--keep-intermediate-files',
        help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.",
        action='store_true')

    star_utils.add_star_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[create-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    # Close the config file deterministically. safe_load is used because
    # yaml.load without an explicit Loader can construct arbitrary objects
    # on old PyYAML versions and is rejected by recent ones.
    with open(args.config) as config_stream:
        config = yaml.safe_load(config_stream)
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar', args.star_executable, 'samtools', 'bowtie2',
        'create-base-genome-profile', 'remove-multimapping-reads',
        'extract-metagene-profiles', 'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets', 'extract-orf-profiles'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data', 'ribosomal_index', 'gtf', 'genome_base_path',
        'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)

    models_base = config.get('models_base', default_models_base)

    # the first step is the standard riboseq preprocessing

    # handle do_not_call so that we _do_ call the preprocessing script,
    # but that it does not run anything
    do_not_call_argument = ""
    if not call:
        do_not_call_argument = "--do-not-call"

    overwrite_argument = ""
    if args.overwrite:
        overwrite_argument = "--overwrite"

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    flexbar_format_option_str = ""
    if args.flexbar_format_option is not None:
        flexbar_format_option_str = "--flexbar-format-option {}".format(
            args.flexbar_format_option)

    # check if we want to keep multimappers
    is_unique = 'keep_riboseq_multimappers' not in config

    riboseq_bam_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                     args.name,
                                                     is_unique=is_unique,
                                                     note=note)

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    cmd = (
        "create-base-genome-profile {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}"
        .format(args.raw_data, args.config, args.name, args.num_cpus,
                do_not_call_argument, overwrite_argument, logging_str,
                star_str, tmp_str, flexbar_format_option_str,
                keep_intermediate_str, mem_str))

    # There could be cases where we start somewhere in the middle of creating
    # the base genome profile. So even if the "raw data" is not available,
    # we still want to call the base pipeline.
    in_files = []
    out_files = [riboseq_bam_filename]
    # we always call this, and pass --do-not-call through
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=True)

    # create the metagene profiles
    metagene_profiles = filenames.get_metagene_profiles(config['riboseq_data'],
                                                        args.name,
                                                        is_unique=is_unique,
                                                        note=note)

    seqids_to_keep_str = utils.get_config_argument(config, 'seqids_to_keep')
    start_upstream_str = utils.get_config_argument(
        config, 'metagene_profile_start_upstream', 'start-upstream')
    start_downstream_str = utils.get_config_argument(
        config, 'metagene_profile_start_downstream', 'start-downstream')
    end_upstream_str = utils.get_config_argument(
        config, 'metagene_profile_end_upstream', 'end-upstream')
    end_downstream_str = utils.get_config_argument(
        config, 'metagene_profile_end_downstream', 'end-downstream')

    # use the canonical transcripts for extracting the metagene profiles
    transcript_bed = filenames.get_bed(config['genome_base_path'],
                                       config['genome_name'],
                                       is_merged=False,
                                       is_annotated=True)

    cmd = ("extract-metagene-profiles {} {} {} --num-cpus {} {} {} {} {} {} {}"
           .format(riboseq_bam_filename, transcript_bed, metagene_profiles,
                   args.num_cpus, logging_str, seqids_to_keep_str,
                   start_upstream_str, start_downstream_str, end_upstream_str,
                   end_downstream_str))

    # The command is given the alignments and the transcript bed file, so both
    # are listed as inputs. orfs_genomic is not passed to this command; it was
    # already part of the inputs and is kept so that the set of files checked
    # before this step does not shrink.
    in_files = [riboseq_bam_filename, transcript_bed, orfs_genomic]
    out_files = [metagene_profiles]
    file_checkers = {metagene_profiles: utils.check_gzip_file}
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite,
                                   call=call)

    # estimate the periodicity for each offset for all read lengths
    metagene_profile_bayes_factors = filenames.get_metagene_profiles_bayes_factors(
        config['riboseq_data'], args.name, is_unique=is_unique, note=note)

    periodic_models = filenames.get_models(models_base, 'periodic')
    non_periodic_models = filenames.get_models(models_base, 'nonperiodic')

    periodic_models_str = "--periodic-models {}".format(
        ' '.join(periodic_models))
    non_periodic_models_str = "--nonperiodic-models {}".format(
        ' '.join(non_periodic_models))

    periodic_offset_start_str = utils.get_config_argument(
        config, 'periodic_offset_start')
    periodic_offset_end_str = utils.get_config_argument(
        config, 'periodic_offset_end')
    metagene_profile_length_str = utils.get_config_argument(
        config, 'metagene_profile_length')
    seed_str = utils.get_config_argument(config, 'seed')
    chains_str = utils.get_config_argument(config, 'chains')
    iterations_str = utils.get_config_argument(config,
                                               'metagene_profile_iterations',
                                               'iterations')

    cmd = ("estimate-metagene-profile-bayes-factors {} {} --num-cpus {} {} {} "
           "{} {} {} {} {} {} {}".format(
               metagene_profiles, metagene_profile_bayes_factors,
               args.num_cpus, periodic_models_str, non_periodic_models_str,
               periodic_offset_start_str, periodic_offset_end_str,
               metagene_profile_length_str, seed_str, chains_str,
               iterations_str, logging_str))

    in_files = [metagene_profiles]
    in_files.extend(periodic_models)
    in_files.extend(non_periodic_models)
    out_files = [metagene_profile_bayes_factors]
    file_checkers = {metagene_profile_bayes_factors: utils.check_gzip_file}
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite,
                                   call=call)

    # select the best read lengths for constructing the signal
    periodic_offsets = filenames.get_periodic_offsets(config['riboseq_data'],
                                                      args.name,
                                                      is_unique=is_unique,
                                                      note=note)

    cmd = "select-periodic-offsets {} {}".format(
        metagene_profile_bayes_factors, periodic_offsets)
    in_files = [metagene_profile_bayes_factors]
    out_files = [periodic_offsets]
    file_checkers = {periodic_offsets: utils.check_gzip_file}
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite,
                                   call=call)

    # get the lengths and offsets which meet the required criteria from the config file
    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
        config, args.name, args.do_not_call, is_unique=is_unique)

    if len(lengths) == 0:
        msg = (
            "No periodic read lengths and offsets were found. Try relaxing "
            "min_metagene_profile_count, min_metagene_bf_mean, max_metagene_bf_var, "
            "and/or min_metagene_bf_likelihood. Quitting.")
        logger.critical(msg)
        return

    lengths_str = ' '.join(lengths)
    offsets_str = ' '.join(offsets)

    seqname_prefix_str = utils.get_config_argument(config, 'seqname_prefix')

    # extract the riboseq profiles for each orf

    # the alignments are the same file created by the base genome profile
    # step above (same arguments to get_riboseq_bam)
    unique_filename = riboseq_bam_filename

    profiles_filename = filenames.get_riboseq_profiles(config['riboseq_data'],
                                                       args.name,
                                                       length=lengths,
                                                       offset=offsets,
                                                       is_unique=is_unique,
                                                       note=note)

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    cmd = (
        "extract-orf-profiles {} {} {} {} --lengths {} --offsets {} {} {} --num-cpus {} "
        .format(unique_filename, orfs_genomic, exons_file, profiles_filename,
                lengths_str, offsets_str, logging_str, seqname_prefix_str,
                args.num_cpus))
    in_files = [orfs_genomic, exons_file, unique_filename]
    out_files = [profiles_filename]

    #todo: implement a file checker for mtx files
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)
Example #8
0
def main():
    """ Run the Rp-Bp and Rp-chi pipelines on a single sample.

    The command line is parsed and the config file is loaded and checked for
    the required keys. If args.use_slurm is set, the original command line is
    handed to slurm.check_sbatch and nothing else is done. Otherwise,
    create-orf-profiles is called and, unless --profiles-only was given,
    predict-translated-orfs is called afterwards.

    NOTE(review): args.do_not_call, args.num_cpus, args.mem and
    args.use_slurm are not defined in this function; presumably they are
    added by slurm.add_sbatch_options -- confirm.

    Returns:
        None
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script runs the Rp-Bp and Rp-chi pipelines on a given sample. "
        "It requires a YAML config file that includes a number of keys. Please see the "
        "documentation for a complete description.")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name', help="The name for the dataset, used in the created files")

    parser.add_argument('--tmp', help="The temp directory", default=default_tmp)

    # adjacent string literals are concatenated without a separator, so each
    # piece carries its own trailing space
    parser.add_argument('--flexbar-options', help="A space-delimited list of options to "
        "pass to flexbar. Each option must be quoted separately as in \"--flexbarOption value\". "
        "If specified, flexbar options will override default settings.", nargs='*', type=str)

    parser.add_argument('--overwrite', help="If this flag is present, existing files "
        "will be overwritten.", action='store_true')

    parser.add_argument('--profiles-only', help="If this flag is present, then only "
        "the ORF profiles will be created", action='store_true')

    parser.add_argument('-k', '--keep-intermediate-files', help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.", action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    # Close the config file deterministically. safe_load is used because
    # yaml.load without an explicit Loader can construct arbitrary objects
    # on old PyYAML versions and is rejected by recent ones.
    with open(args.config) as config_stream:
        config = yaml.safe_load(config_stream)
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'create-base-genome-profile',
        'remove-multimapping-reads',
        'extract-metagene-profiles',
        'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets',
        'extract-orf-profiles',
        'estimate-orf-bayes-factors',
        'select-final-prediction-set',
        'create-orf-profiles',
        'predict-translated-orfs'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'ribosomal_index',
        'star_index',
        'genome_base_path',
        'genome_name',
        'fasta',
        'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    # now, check if we want to use slurm
    msg = "use_slurm: {}".format(args.use_slurm)
    logger.debug(msg)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # the first step is the standard riboseq preprocessing

    # handle do_not_call so that we _do_ call the preprocessing script,
    # but that it does not run anything
    do_not_call_str = ""
    if not call:
        do_not_call_str = "--do-not-call"

    overwrite_str = ""
    if args.overwrite:
        overwrite_str = "--overwrite"

    # for a sample, we first create its filtered genome profile

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    # each option is passed on as one shell word; shlex.quote also copes with
    # options which themselves contain quote characters
    flexbar_option_str = ""
    if args.flexbar_options is not None:
        quoted_options = ' '.join(
            shlex.quote(flx_op) for flx_op in args.flexbar_options)
        flexbar_option_str = "--flexbar-options {}".format(quoted_options)

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    cmd = ("create-orf-profiles {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}".format(args.raw_data,
            args.config, args.name, args.num_cpus, mem_str, do_not_call_str, overwrite_str,
            logging_str, star_str, tmp_str, flexbar_option_str, keep_intermediate_str))

    shell_utils.check_call(cmd)

    # check if we only want to create the profiles
    if args.profiles_only:
        return

    # then we predict the ORFs
    cmd = ("predict-translated-orfs {} {} --num-cpus {} {} {} {}".format(args.config,
            args.name, args.num_cpus, do_not_call_str, overwrite_str, logging_str))
    shell_utils.check_call(cmd)
Example #9
0
def _create_figures(name_pretty_name_is_replicate, config, args):
    """ This function creates all of the figures in the prediction report
        for the given dataset.

        Up to three groups of figures are requested. Each figure is produced
        by an external script which is submitted through
        shell_utils.call_if_not_exists together with its input and output
        files and args.overwrite:

            * ORF type bar charts (create-orf-types-bar-chart), for every
              combination of grouped/ungrouped, chi-square/Rp-Bp and
              filtered/unfiltered which was requested
            * ORF length distribution line graphs
              (create-orf-length-distribution-line-graph), for every
              combination of grouped/ungrouped and chi-square/Rp-Bp
            * ORF type metagene profiles
              (visualize-orf-type-metagene-profiles); only if
              args.show_orf_periodicity is set

        Args:
            name_pretty_name_is_replicate (tuple): (name, pretty_name,
                is_replicate). name is used to find the files,
                pretty_name is used in the titles. For replicates, the
                periodic lengths and offsets are not looked up.

            config (dict): the configuration. 'riboseq_data' is required;
                'note', 'smoothing_fraction',
                'smoothing_reweighting_iterations' and
                'keep_riboseq_multimappers' are optional.

            args (namespace): the command line options. The attributes
                read directly here are show_chisq, show_unfiltered_orfs,
                note, image_type, uniprot, uniprot_label,
                show_orf_periodicity and overwrite.

        Returns:
            None. If the periodic lengths and offsets of a sample cannot
            be found (FileNotFoundError), an error is logged and no figure
            is created for that sample.
    """
    name, pretty_name, is_replicate = name_pretty_name_is_replicate

    # keep multimappers?
    is_unique = 'keep_riboseq_multimappers' not in config

    # by default, we will not include chisq
    chisq_values = [True, False] if args.show_chisq else [False]

    # and we only show the filtered ORFs
    filtered_values = [True, False] if args.show_unfiltered_orfs else [True]

    grouped_values = [True, False]

    logging_str = logging_utils.get_logging_options_string(args)

    # the note from the config always selects the input files; a non-empty
    # note from the command line is used for the names of the figures
    note_str = config.get('note', None)
    out_note_str = note_str
    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note

    image_type_str = "--image-type {}".format(args.image_type)

    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations',
                                        None)

    # if this is a replicate, we do not worry about lengths and offsets
    if is_replicate:
        lengths = None
        offsets = None
    else:
        try:
            lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                config, name, is_unique=is_unique)
        except FileNotFoundError:
            msg = ("Could not parse out lengths and offsets for sample: {}. "
                   "Skipping".format(name))
            logger.error(msg)
            return

    unsmoothed_profiles = filenames.get_riboseq_profiles(
        config['riboseq_data'],
        name,
        length=lengths,
        offset=offsets,
        is_unique=is_unique,
        note=note_str,
        is_smooth=False)

    # the label of the prediction method used at the end of each title. The
    # backslash must reach the title verbatim, so a raw string is used
    # rather than relying on "\c" being an unrecognized escape sequence.
    method_labels = {True: r"Rp-$\chi^2$", False: "Rp-Bp"}

    def _title_option(title):
        # the title contains spaces (and "$"), so it must be shell-quoted
        return "--title {}".format(shlex.quote(title))

    def _smoothing_options(is_chisq):
        # the chi-square predictions do not use the smoothing parameters
        if is_chisq:
            return None, None
        return fraction, reweighting_iterations

    def _predicted_orfs(is_chisq, **extra):
        # the predicted ORFs file of either the chi-square or the Rp-Bp
        # predictions; extra is passed through to the filename helper
        if is_chisq:
            return filenames.get_riboseq_predicted_orfs(
                config['riboseq_data'],
                name,
                length=lengths,
                offset=offsets,
                is_unique=is_unique,
                note=note_str,
                is_chisq=True,
                **extra)

        return filenames.get_riboseq_predicted_orfs(
            config['riboseq_data'],
            name,
            length=lengths,
            offset=offsets,
            is_unique=is_unique,
            note=note_str,
            fraction=fraction,
            reweighting_iterations=reweighting_iterations,
            **extra)

    msg = "{}: creating the ORF types bar charts".format(name)
    logger.debug(msg)

    it = itertools.product(grouped_values, chisq_values, filtered_values)

    for is_grouped, is_chisq, is_filtered in it:

        is_grouped_str = ", Grouped" if is_grouped else ""
        is_filtered_str = ", Filtered" if is_filtered else ""

        title_str = _title_option("{}{}{}, {}".format(
            pretty_name, is_grouped_str, is_filtered_str,
            method_labels[is_chisq]))

        f, rw = _smoothing_options(is_chisq)
        orfs = _predicted_orfs(is_chisq, is_filtered=is_filtered)

        use_groups_str = "--use-groups" if is_grouped else ""

        orf_types_bar_chart = filenames.get_orf_types_bar_chart(
            config['riboseq_data'],
            name,
            length=lengths,
            offset=offsets,
            is_unique=is_unique,
            note=out_note_str,
            image_type=args.image_type,
            fraction=f,
            reweighting_iterations=rw,
            is_grouped=is_grouped,
            is_chisq=is_chisq,
            is_filtered=is_filtered)

        cmd = "create-orf-types-bar-chart {} {} {} {}".format(
            orfs, orf_types_bar_chart, title_str, use_groups_str)

        in_files = [orfs]
        out_files = [orf_types_bar_chart]
        shell_utils.call_if_not_exists(cmd,
                                       out_files,
                                       in_files=in_files,
                                       overwrite=args.overwrite)

    msg = "{}: creating the ORF length distributions line graph".format(name)
    logger.debug(msg)

    # the uniprot options are only passed along if the file exists
    uniprot_str = ""
    uniprot_label_str = ""
    if os.path.exists(args.uniprot):
        uniprot_str = "--uniprot {}".format(args.uniprot)
        uniprot_label_str = shlex.quote(args.uniprot_label)
        uniprot_label_str = "--uniprot-label {}".format(uniprot_label_str)

    for is_grouped, is_chisq in itertools.product(grouped_values,
                                                  chisq_values):

        title_str = _title_option("{}, {}".format(pretty_name,
                                                  method_labels[is_chisq]))

        f, rw = _smoothing_options(is_chisq)

        # the filtering option is left at the default of the filename helper
        orfs = _predicted_orfs(is_chisq)

        use_groups_str = "--use-groups" if is_grouped else ""

        orf_length_line_graph = filenames.get_orf_length_distribution_line_graph(
            config['riboseq_data'],
            name,
            length=lengths,
            offset=offsets,
            is_unique=is_unique,
            note=out_note_str,
            image_type=args.image_type,
            fraction=f,
            reweighting_iterations=rw,
            is_grouped=is_grouped,
            is_chisq=is_chisq)

        cmd = (
            "create-orf-length-distribution-line-graph {} {} {} {} {} {}".
            format(orfs, orf_length_line_graph, title_str, use_groups_str,
                   uniprot_str, uniprot_label_str))

        in_files = [orfs]
        out_files = [orf_length_line_graph]
        shell_utils.call_if_not_exists(cmd,
                                       out_files,
                                       in_files=in_files,
                                       overwrite=args.overwrite)

    if args.show_orf_periodicity:
        msg = "{}: creating the ORF type metagene profiles".format(name)
        logger.debug(msg)

        # NOTE(review): the chi-square branch below historically used the
        # "is_filtered" variable left over from the bar chart loop, that
        # is, the last entry of filtered_values. The value is kept, but is
        # now spelled out. The Rp-Bp branch does not pass is_filtered at
        # all; confirm which of the two is intended.
        last_is_filtered = filtered_values[-1]

        for is_chisq in chisq_values:

            title_str = _title_option("{}, {}".format(
                pretty_name, method_labels[is_chisq]))

            f, rw = _smoothing_options(is_chisq)

            # the unsmoothed profiles are used for both prediction methods
            profiles = unsmoothed_profiles

            if is_chisq:
                orfs = _predicted_orfs(is_chisq,
                                       is_filtered=last_is_filtered)
            else:
                orfs = _predicted_orfs(is_chisq)

            orf_type_profile_base = filenames.get_orf_type_profile_base(
                config['riboseq_data'],
                name,
                length=lengths,
                offset=offsets,
                is_unique=is_unique,
                note=out_note_str,
                fraction=f,
                reweighting_iterations=rw,
                is_chisq=is_chisq)

            # one image per ORF type and strand is expected as output
            strand = "+"
            orf_type_profiles_forward = [
                filenames.get_orf_type_profile_image(orf_type_profile_base,
                                                     orf_type, strand,
                                                     args.image_type)
                for orf_type in ribo_utils.orf_types
            ]

            strand = "-"
            orf_type_profiles_reverse = [
                filenames.get_orf_type_profile_image(orf_type_profile_base,
                                                     orf_type, strand,
                                                     args.image_type)
                for orf_type in ribo_utils.orf_types
            ]

            cmd = ("visualize-orf-type-metagene-profiles {} {} {} {} {} {}".
                   format(orfs, profiles, orf_type_profile_base, title_str,
                          image_type_str, logging_str))

            in_files = [orfs]
            out_files = orf_type_profiles_forward + orf_type_profiles_reverse
            shell_utils.call_if_not_exists(cmd,
                                           out_files,
                                           in_files=in_files,
                                           overwrite=args.overwrite)
Example #10
0
def get_orfs(gtf, args, config, is_annotated=False, is_de_novo=False):
    """ Process a GTF file into its ORFs by chaining the external helper
        scripts.

        The following commands are submitted, in this order, through
        shell_utils.call_if_not_exists:

            1. gtf-to-bed12: the GTF file to a bed12 file of transcripts
            2. split-bed12-blocks: the transcripts to an exons file
            3. extract-bed-sequences: the transcript fasta file
            4. extract-orf-coordinates: the ORFs of the transcripts
            5. split-bed12-blocks: the ORFs to an exons file (named with
               the 'orf_note' of the config)
            6. label-orfs: label the ORFs with respect to the annotated
               transcripts; the ORFs file is passed as both the input and
               the output path

        Args:
            gtf (string): the path to the GTF file

            args (namespace): the command line options; do_not_call,
                overwrite and num_cpus are read here

            config (dict): the configuration; 'star_index',
                'genome_base_path', 'genome_name' and 'fasta' are
                required, 'orf_note' is optional

            is_annotated, is_de_novo (bool): passed to the filenames
                helpers to select the names of the files

        Returns:
            None
    """
    call = not args.do_not_call

    def run_step(command, inputs, outputs):
        # all of the steps share the same overwrite and call settings
        shell_utils.call_if_not_exists(command,
                                       outputs,
                                       in_files=inputs,
                                       overwrite=args.overwrite,
                                       call=call)

    chr_name_str = "--chr-name-file {}".format(
        os.path.join(config['star_index'], 'chrName.txt'))

    logging_str = logging_utils.get_logging_options_string(args)
    cpus_str = "--num-cpus {}".format(args.num_cpus)

    # the pieces shared by all of the filename lookups
    genome = (config['genome_base_path'], config['genome_name'])
    source = {'is_annotated': is_annotated, 'is_de_novo': is_de_novo}

    # step 1: extract a bed12 of the annotated transcripts
    transcript_bed = filenames.get_bed(*genome, is_merged=False, **source)
    run_step(
        "gtf-to-bed12 {gtf} {bed} --num-cpus {cpus} {chrs} {log}".format(
            gtf=gtf,
            bed=transcript_bed,
            cpus=args.num_cpus,
            chrs=chr_name_str,
            log=logging_str),
        [gtf],
        [transcript_bed])

    # step 2: split the transcripts into their exons
    transcript_exons = filenames.get_exons(*genome, **source)
    run_step(
        "split-bed12-blocks {bed} {exons} --num-cpus {cpus} {log}".format(
            bed=transcript_bed,
            exons=transcript_exons,
            cpus=args.num_cpus,
            log=logging_str),
        [transcript_bed],
        [transcript_exons])

    # step 3: extract the transcript fasta
    transcript_fasta = filenames.get_transcript_fasta(*genome, **source)
    genome_fasta = config['fasta']
    run_step(
        "extract-bed-sequences {bed} {genome} {out} {log}".format(
            bed=transcript_bed,
            genome=genome_fasta,
            out=transcript_fasta,
            log=logging_str),
        [transcript_bed, genome_fasta],
        [transcript_fasta])

    # step 4: new approach for extracting orfs
    orf_note = config.get('orf_note')
    orfs_genomic = filenames.get_orfs(*genome, note=orf_note, **source)
    start_codons_str = utils.get_config_argument(config, 'start_codons')
    stop_codons_str = utils.get_config_argument(config, 'stop_codons')
    run_step(
        "extract-orf-coordinates {bed} {fasta} {orfs} {cpus} {starts} "
        "{stops} {log}".format(bed=transcript_bed,
                               fasta=transcript_fasta,
                               orfs=orfs_genomic,
                               cpus=cpus_str,
                               starts=start_codons_str,
                               stops=stop_codons_str,
                               log=logging_str),
        [transcript_fasta, transcript_bed],
        [orfs_genomic])

    # step 5: split the ORFs into their exons; unlike the transcript
    # exons, this file name includes the orf note
    orf_exons = filenames.get_exons(*genome, note=orf_note, **source)
    run_step(
        "split-bed12-blocks {bed} {exons} --num-cpus {cpus} {log}".format(
            bed=orfs_genomic,
            exons=orf_exons,
            cpus=args.num_cpus,
            log=logging_str),
        [orfs_genomic],
        [orf_exons])

    # step 6: label the orfs

    # no need to keep the unannotated ones around, so the name is reused
    labeled_orfs = orfs_genomic

    # we always label wrt the annotated annotations
    annotated_bed = filenames.get_bed(*genome,
                                      is_merged=False,
                                      is_annotated=True)

    if is_de_novo:
        de_novo_str = "--label-prefix \"novel_\" --filter --nonoverlapping-label \"novel\""
    else:
        de_novo_str = ""

    # since we are reusing the name, the output will already exist, so no
    # output files are given for this step
    run_step(
        "label-orfs {annotated} {orfs} {exons} {out} {cpus} {de_novo} "
        "{log}".format(annotated=annotated_bed,
                       orfs=orfs_genomic,
                       exons=orf_exons,
                       out=labeled_orfs,
                       cpus=cpus_str,
                       de_novo=de_novo_str,
                       log=logging_str),
        [annotated_bed, orfs_genomic, orf_exons],
        None)
Example #11
0
def main():
    """ Create all of the files necessary for downstream analysis with the
        rpbp package, as described by the (yaml) config file given on the
        command line.

        After validating the required programs, config keys and input
        files, the steps are:

            1. (optionally) hand the whole command over to slurm and return
            2. build the bowtie2 index of the ribosomal sequences
            3. build the STAR index of the genome
            4. extract and label the ORFs of the annotated GTF (get_orfs)
            5. if 'de_novo_gtf' is in the config, do the same for the
               de novo assembly and concatenate the annotated and de novo
               ORF and exon files; otherwise, symlink the annotated files
               to the final file names

        Returns:
            None
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script creates all of the files necessary for downstream "
        "analysis performed with the rpbp package.")
    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files "
                        "will be overwritten.",
                        action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # NOTE: the config is read with the safe loader. Calling yaml.load
    # without a Loader is an error in recent PyYAML versions and can
    # construct arbitrary python objects in old ones; the config is
    # expected to contain only plain yaml types. The context manager makes
    # sure the file is closed again.
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)

    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'extract-orf-coordinates', 'label-orfs', 'bowtie2-build-s',
        'split-bed12-blocks', 'gtf-to-bed12', args.star_executable
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'genome_base_path', 'genome_name', 'gtf', 'fasta', 'ribosomal_fasta',
        'ribosomal_index', 'star_index'
    ]
    utils.check_keys_exist(config, required_keys)

    # check that the required files are present
    files = [config['gtf'], config['fasta'], config['ribosomal_fasta']]

    if 'de_novo_gtf' in config:
        files += [config['de_novo_gtf']]

    utils.check_files_exist(files, source='prepare-rpbp-genome')

    # now, check if we want to use slurm
    if args.use_slurm:
        # resubmit exactly the command line we were called with
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # the rrna index
    cmd = "bowtie2-build-s {} {}".format(config['ribosomal_fasta'],
                                         config['ribosomal_index'])

    in_files = [config['ribosomal_fasta']]
    out_files = bio.get_bowtie2_index_files(config['ribosomal_index'])
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)

    # the STAR index
    mem = utils.human2bytes(args.mem)
    cmd = ("{} --runMode genomeGenerate --genomeDir {} --genomeFastaFiles {} "
           "--runThreadN {} --limitGenomeGenerateRAM {}".format(
               args.star_executable, config['star_index'], config['fasta'],
               args.num_cpus, mem))

    in_files = [config['fasta']]
    out_files = star_utils.get_star_index_files(config['star_index'])
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)

    # get the main orfs
    get_orfs(config['gtf'], args, config, is_annotated=True, is_de_novo=False)

    # eventually, we will use these names

    # the files created from the annotated GTF...
    annotated_orfs = filenames.get_orfs(config['genome_base_path'],
                                        config['genome_name'],
                                        note=config.get('orf_note'),
                                        is_annotated=True,
                                        is_de_novo=False)

    annotated_exons_file = filenames.get_exons(config['genome_base_path'],
                                               config['genome_name'],
                                               note=config.get('orf_note'),
                                               is_annotated=True,
                                               is_de_novo=False)

    # ...and the final files, written (or linked) below
    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    # now, check if we have a de novo assembly
    if 'de_novo_gtf' in config:
        get_orfs(config['de_novo_gtf'],
                 args,
                 config,
                 is_annotated=False,
                 is_de_novo=True)

        # we need to concat the ORF and exon files
        de_novo_orfs = filenames.get_orfs(config['genome_base_path'],
                                          config['genome_name'],
                                          note=config.get('orf_note'),
                                          is_annotated=False,
                                          is_de_novo=True)

        de_novo_exons_file = filenames.get_exons(config['genome_base_path'],
                                                 config['genome_name'],
                                                 note=config.get('orf_note'),
                                                 is_annotated=False,
                                                 is_de_novo=True)

        orfs_files = [annotated_orfs, de_novo_orfs]

        orfs_files_str = ' '.join(orfs_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            orfs_genomic, orfs_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(orfs_files, sort_bed=True)

            # renumber the ORFs over the combined (sorted) file
            concatenated_bed['orf_num'] = range(len(concatenated_bed))
            fields = bed_utils.bed12_field_names + [
                'orf_num', 'orf_len', 'orf_type'
            ]
            bed_utils.write_bed(concatenated_bed[fields], orfs_genomic)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

        exons_files = [annotated_exons_file, de_novo_exons_file]

        exons_files_str = ' '.join(exons_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            exons_file, exons_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(exons_files,
                                                     sort_bed=True)
            fields = bed_utils.bed6_field_names + [
                'exon_index', 'transcript_start'
            ]
            bed_utils.write_bed(concatenated_bed[fields], exons_file)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

    else:
        # finally, make sure our files are named correctly

        if os.path.exists(annotated_orfs):
            utils.create_symlink(annotated_orfs, orfs_genomic, call)

        if os.path.exists(annotated_exons_file):
            utils.create_symlink(annotated_exons_file, exons_file, call)
def main():
    """ Submit each of the samples in the config file to the rpbp pipeline,
        either through SLURM or sequentially.

        One run-rpbp-pipeline command is built per entry of
        'riboseq_samples' and passed to slurm.check_sbatch. If
        --merge-replicates is given, one predict-translated-orfs command
        per condition is submitted afterward, with the sample jobs as
        dependencies.

        Returns:
            None
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This is a helper script which submits a set of samples to SLURM. It "
        "can also be used to run a set of samples sequentially. Due to limitations on "
        "the config file specification, all of the samples must use the same reference "
        "indices (i.e., genome sequence, set of ORFs, etc.).")

    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--tmp', help="The temp directory", default=default_tmp)

    parser.add_argument('--flexbar-options', help="A space-delimited list of options to "
        "pass to flexbar. Each option must be quoted separately as in \"--flexbarOption value\". "
        "If specified, flexbar options will override default settings.", nargs='*', type=str)

    parser.add_argument('--overwrite', help="If this flag is present, existing files "
        "will be overwritten.", action='store_true')

    parser.add_argument('--profiles-only', help="If this flag is present, then only "
        "the pre-processing part of the pipeline will be called, i.e. profiles "
        "will be created for each sample specified in the config file, but no predictions "
        "will be made.", action='store_true')

    parser.add_argument('--merge-replicates', help="If this flag is present, then "
        "the ORF profiles from the replicates will be merged before making the final "
        "predictions", action='store_true')

    parser.add_argument('--run-replicates', help="If this flag is given with the "
        "--merge-replicates flag, then both the replicates *and* the individual "
        "samples will be run. This flag has no effect if --merge-replicates is not "
        "given.", action='store_true')

    parser.add_argument('-k', '--keep-intermediate-files', help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.", action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    # NOTE: the config is read with the safe loader. Calling yaml.load
    # without a Loader is an error in recent PyYAML versions and can
    # construct arbitrary python objects in old ones; the config is
    # expected to contain only plain yaml types. The context manager makes
    # sure the file is closed again.
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)

    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'create-base-genome-profile',
        'remove-multimapping-reads',
        'extract-metagene-profiles',
        'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets',
        'extract-orf-profiles',
        'estimate-orf-bayes-factors',
        'select-final-prediction-set',
        'create-orf-profiles',
        'predict-translated-orfs',
        'run-rpbp-pipeline'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'riboseq_samples',
        'ribosomal_index',
        'star_index',
        'genome_base_path',
        'genome_name',
        'fasta',
        'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)

    # handle do_not_call so that we _do_ call the pipeline script, but that it does not run anything
    do_not_call_str = ""
    if not call:
        do_not_call_str = "--do-not-call"

    # the flag was already forwarded in do_not_call_str; reset it so that
    # the submission itself is not suppressed
    args.do_not_call = False

    overwrite_str = ""
    if args.overwrite:
        overwrite_str = "--overwrite"

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    # check if we only want to create the profiles, in this case
    # we call run-rpbp-pipeline with the --profiles-only option
    profiles_only_str = ""
    if args.profiles_only:
        args.merge_replicates = False
        profiles_only_str = "--profiles-only"
        msg = ("The --profiles-only option was given, this will override --merge-replicates "
               "and/or --run-replicates, if these options were also given!")
        logger.info(msg)

    # if we merge the replicates, then we only use the rpbp script to create
    # the ORF profiles, but we still make predictions
    if args.merge_replicates and not args.run_replicates:
        profiles_only_str = "--profiles-only"

    if args.run_replicates and not args.merge_replicates:
        msg = ("The --run-replicates option was given without the --merge-replicates "
            "option. It will be ignored.")
        logger.warning(msg)

    flexbar_option_str = ""
    if args.flexbar_options is not None:
        flexbar_option_str = "--flexbar-options {}".format(' '.join('"' + flx_op + '"'
            for flx_op in args.flexbar_options))

    # collect the job_ids in case we are using slurm and need to merge replicates
    job_ids = []

    sample_names = sorted(config['riboseq_samples'].keys())

    for sample_name in sample_names:
        data = config['riboseq_samples'][sample_name]

        # each sample gets its own temp directory below the given one
        # NOTE(review): if the config has no 'note', the directory name
        # contains the literal text "None"
        tmp_str = ""
        if args.tmp is not None:
            tmp = os.path.join(args.tmp, "{}_{}_rpbp".format(sample_name, note))
            tmp_str = "--tmp {}".format(tmp)

        cmd = "run-rpbp-pipeline {} {} {} --num-cpus {} {} {} {} {} {} {} {} {} {}".format(
            data,
            args.config,
            sample_name,
            args.num_cpus,
            tmp_str,
            do_not_call_str,
            overwrite_str,
            logging_str,
            star_str,
            profiles_only_str,
            flexbar_option_str,
            keep_intermediate_str,
            mem_str
        )

        job_id = slurm.check_sbatch(cmd, args=args)

        job_ids.append(job_id)

    # now, if we are running the "standard" pipeline, we are finished
    if not args.merge_replicates:
        return

    # otherwise, we need to merge the replicates for each condition
    riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
    merge_replicates_str = "--merge-replicates"

    for condition_name in sorted(riboseq_replicates.keys()):

        # then we predict the ORFs
        cmd = "predict-translated-orfs {} {} --num-cpus {} {} {} {} {}".format(
            args.config,
            condition_name,
            args.num_cpus,
            do_not_call_str,
            overwrite_str,
            logging_str,
            merge_replicates_str
        )

        # the predictions must wait for all of the sample jobs
        slurm.check_sbatch(cmd, args=args, dependencies=job_ids)
def main():
    """Build one `process-oasc-scenario` command per scenario and run them
    on remote nodes through `ssh_utils`.

    The scenarios are the sub-directories of `<oasc_scenarios_dir>/train`;
    each is paired with the directory of the same name under
    `<oasc_scenarios_dir>/test`. With `--dry-run` the commands are only
    logged. With `--store-out`, the command, node, return code, stdout and
    stderr of every remote call are written to the given file.

    Raises:
        FileNotFoundError: if the config file given on the command line
            does not exist.
    """
    description = ("Distribute runs of process-oasc-scenario around a "
                   "cluster using password-less ssh")
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    clu.add_config(parser)
    parser.add_argument('oasc_scenarios_dir')

    store_out_help = (
        "If this is option is given, then the "
        "output of each of the distributed processes will be written here.")
    parser.add_argument('--store-out', default=None, help=store_out_help)

    clu.add_num_cpus(parser)
    clu.add_cv_options(parser)
    clu.add_scheduler_options(parser)

    automl_utils.add_automl_options(parser)
    automl_utils.add_blas_options(parser)

    ssh_utils.add_ssh_options(parser)

    dry_run_help = (
        "If this flag is given, then the "
        "commands will be printed to the screen, but not executed")
    parser.add_argument('--dry-run', action='store_true', help=dry_run_help)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # fail early when the config file is missing
    if not os.path.exists(args.config):
        raise FileNotFoundError(
            "Could not find the config file: {}".format(args.config))

    # NOTE(review): the config option string is computed but never used
    # below; the commands receive args.config directly.
    config_str = clu.get_config_options_string(args)

    # the option strings that are forwarded, in this order, to every command
    cpus_str = clu.get_num_cpus_options_string(args)
    cv_str = clu.get_cv_options_string(args)
    scheduler_str = clu.get_scheduler_options_string(args)
    automl_str = automl_utils.get_automl_options_string(args)
    blas_str = automl_utils.get_blas_options_string(args)
    logging_str = logging_utils.get_logging_options_string(args)
    forwarded_options = [
        cpus_str, cv_str, scheduler_str, automl_str, blas_str, logging_str
    ]

    training_dir = os.path.join(args.oasc_scenarios_dir, "train")
    testing_dir = os.path.join(args.oasc_scenarios_dir, "test")

    # every directory below "train" is taken to be one scenario
    scenarios = [
        utils.get_basename(path)
        for path in utils.listdir_full(training_dir)
        if os.path.isdir(path)
    ]

    commands = []
    for scenario in scenarios:
        pieces = [
            "process-oasc-scenario",
            args.config,
            os.path.join(training_dir, scenario),
            os.path.join(testing_dir, scenario),
        ]
        commands.append(' '.join(pieces + forwarded_options))

    # a dry run only reports what would have been executed
    if args.dry_run:
        skip_msg = "Skipping due to --dry-run flag"
        for command in commands:
            logger.info(command)
            logger.info(skip_msg)

        return

    # otherwise, make the remote calls and wait for them to finish
    node_list, proc_list = ssh_utils.distribute_all(
        commands, args.node_list, args.connection_timeout, args.max_tries)

    results = ssh_utils.wait_for_all_results(commands, node_list, proc_list)
    if results is None:
        return

    return_codes, stdouts, stderrs = results
    if args.store_out is None:
        return

    # one five-line record per remote call
    with open(args.store_out, 'w') as out:
        for i, return_code in enumerate(return_codes):
            record = (
                commands[i],
                node_list[i],
                "return code: {}".format(return_code),
                "stdout: {}".format(stdouts[i]),
                "stderr: {}".format(stderrs[i]),
            )
            for line in record:
                out.write(line)
                out.write("\n")
Example #14
0
def main():
    """Predict translated ORFs for a single sample or a merged condition.

    The yaml config file and the dataset name are read from the command
    line. The function then builds and issues, through
    `shell_utils.call_if_not_exists`, the following external commands:

    * `merge-replicate-orf-profiles` (only with `--merge-replicates`, which
      also forces `--overwrite`),
    * `estimate-orf-bayes-factors`,
    * `select-final-prediction-set`, once for every combination of
      filtered/unfiltered and chi-square/Bayes-factor selection (only the
      chi-square variant when `chi_square_only` is present in the config).

    `--overwrite` and `--do-not-call` are forwarded as the `overwrite` and
    `call` arguments of `call_if_not_exists`.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script runs all of the processing necessary to produce the "
        "signals used for later processing. In particular, it runs the standard "
        "rnaseq and riboseq preprocessing, estimates the abundance of transcripts with "
        "the rnaseq data, and selects the most-expressed isoforms and ORFs. Next, it removes "
        "multimapping and non-periodic-length riboseq reads. Finally, it extracts the riboseq "
        "signal for the most-expressed transcripts.")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument(
        'name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of processors to use",
                        type=int,
                        default=default_num_cpus)

    parser.add_argument('--do-not-call', action='store_true')
    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files "
                        "will be overwritten.",
                        action='store_true')

    parser.add_argument(
        '--merge-replicates',
        help="If this flag is present, then the ORF "
        "profiles will be merged for all replicates in the condition given by <name>. The "
        "filenames, etc., will reflect the condition name, but not the lengths and offsets "
        "of the individual replicates.\n\nN.B. If this flag is is present, the --overwrite "
        "flag will automatically be set!",
        action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[predict_translated_orfs]: {}".format(' '.join(sys.argv))
    logger.debug(msg)

    logging_str = logging_utils.get_logging_options_string(args)

    # safe_load only constructs plain YAML types and does not need an
    # explicit Loader; the context manager closes the file handle
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = ['estimate-orf-bayes-factors', 'select-final-prediction-set']
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data', 'fasta', 'genome_base_path', 'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    models_base = config.get('models_base', default_models_base)

    note_str = config.get('note', None)

    # we always need the ORFs
    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    # and the smoothing parameters
    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations',
                                        None)

    # multimappers are only kept when the config asks for it
    is_unique = 'keep_riboseq_multimappers' not in config

    # either way, the following variables need to have values for the rest of
    # the pipeline: lengths, offsets, profiles
    if args.merge_replicates:
        msg = ("The --merge-replicates option was given, so --overwrite is "
               "being set to True.")
        logger.warning(msg)
        args.overwrite = True

        # now, actually merge the replicates
        riboseq_replicates = ribo_utils.get_riboseq_replicates(config)

        # we will not use the lengths and offsets in the filenames
        lengths = None
        offsets = None

        # we will also merge all of unsmoothed profiles
        replicate_profiles = [
            get_profile(name, config, args)
            for name in riboseq_replicates[args.name]
        ]

        replicate_profiles_str = ' '.join(replicate_profiles)

        profiles = filenames.get_riboseq_profiles(config['riboseq_data'],
                                                  args.name,
                                                  length=lengths,
                                                  offset=offsets,
                                                  is_unique=is_unique,
                                                  note=note_str)

        cmd = "merge-replicate-orf-profiles {} {} {}".format(
            replicate_profiles_str, profiles, logging_str)
        in_files = replicate_profiles
        out_files = [profiles]

        # todo: implement file checker for mtx files
        shell_utils.call_if_not_exists(cmd,
                                       out_files,
                                       in_files=in_files,
                                       overwrite=args.overwrite,
                                       call=call)

    else:
        # otherwise, just treat things as normal
        # get the lengths and offsets which meet the required criteria from
        # the config file
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name, args.do_not_call, is_unique=is_unique)

        profiles = get_profile(args.name, config, args)

    # all of the output filenames below share this sample description
    name_options = {
        'length': lengths,
        'offset': offsets,
        'is_unique': is_unique,
        'note': note_str,
        'fraction': fraction,
        'reweighting_iterations': reweighting_iterations,
    }

    # estimate the bayes factors
    bayes_factors = filenames.get_riboseq_bayes_factors(
        config['riboseq_data'], args.name, **name_options)

    # the smoothing options
    min_length_str = utils.get_config_argument(config, 'min_orf_length',
                                               'min-length')
    max_length_str = utils.get_config_argument(config, 'max_orf_length',
                                               'max-length')
    min_profile_str = utils.get_config_argument(config, 'min_signal',
                                                'min-profile')

    fraction_str = utils.get_config_argument(config, 'smoothing_fraction',
                                             'fraction')
    reweighting_iterations_str = utils.get_config_argument(
        config, 'smoothing_reweighting_iterations', 'reweighting-iterations')

    # parse out all of the options from the config file, if they are present
    translated_models = filenames.get_models(models_base, 'translated')
    untranslated_models = filenames.get_models(models_base, 'untranslated')

    translated_models_str = "--translated-models {}".format(
        ' '.join(translated_models))
    untranslated_models_str = "--untranslated-models {}".format(
        ' '.join(untranslated_models))

    orf_types_str = utils.get_config_argument(config, 'orf_types')

    seed_str = utils.get_config_argument(config, 'seed')
    chains_str = utils.get_config_argument(config, 'chains', 'chains')
    iterations_str = utils.get_config_argument(config,
                                               'translation_iterations',
                                               'iterations')

    # the mere presence of the key enables chi-square-only mode
    chi_square_only = 'chi_square_only' in config
    chi_square_only_str = "--chi-square-only" if chi_square_only else ""

    cmd = (
        "estimate-orf-bayes-factors {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} "
        "--num-cpus {}".format(profiles, orfs_genomic, bayes_factors,
                               translated_models_str, untranslated_models_str,
                               logging_str, orf_types_str, min_length_str,
                               max_length_str, min_profile_str, fraction_str,
                               reweighting_iterations_str, seed_str,
                               iterations_str, chains_str, chi_square_only_str,
                               args.num_cpus))

    in_files = [profiles, orfs_genomic]
    in_files.extend(translated_models)
    in_files.extend(untranslated_models)
    out_files = [bayes_factors]
    file_checkers = {bayes_factors: utils.check_gzip_file}
    msg = "estimate-bayes-factors in_files: {}".format(in_files)
    logger.debug(msg)
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite,
                                   call=call)

    is_chisq_values = [True] if chi_square_only else [True, False]

    # the selection thresholds do not depend on the loop variables, so the
    # option strings are built once
    min_bf_mean_str = utils.get_config_argument(config, 'min_bf_mean')
    max_bf_var_str = utils.get_config_argument(config, 'max_bf_var')
    min_bf_likelihood_str = utils.get_config_argument(config,
                                                      'min_bf_likelihood')

    # NOTE(review): 'chisq_significance_level' and 'min_signal' from the
    # config are not forwarded to select-final-prediction-set here; confirm
    # whether that tool should receive them.

    for is_filtered in [True, False]:

        filtered_str = ""
        if is_filtered:
            filtered_str = "--select-longest-by-stop --select-best-overlapping"

        for is_chisq in is_chisq_values:

            chisq_str = "--use-chi-square" if is_chisq else ""

            # now, select the ORFs (longest for each stop codon) which pass
            # the prediction filters
            selection_options = dict(name_options,
                                     is_filtered=is_filtered,
                                     is_chisq=is_chisq)

            predicted_orfs = filenames.get_riboseq_predicted_orfs(
                config['riboseq_data'], args.name, **selection_options)

            predicted_orfs_dna = filenames.get_riboseq_predicted_orfs_dna(
                config['riboseq_data'], args.name, **selection_options)

            predicted_orfs_protein = filenames.get_riboseq_predicted_orfs_protein(
                config['riboseq_data'], args.name, **selection_options)

            cmd = "select-final-prediction-set {} {} {} {} {} {} {} {} {} {} {}".format(
                bayes_factors, config['fasta'], predicted_orfs,
                predicted_orfs_dna, predicted_orfs_protein, min_bf_mean_str,
                max_bf_var_str, min_bf_likelihood_str, logging_str, chisq_str,
                filtered_str)

            in_files = [bayes_factors, config['fasta']]
            out_files = [
                predicted_orfs, predicted_orfs_dna, predicted_orfs_protein
            ]

            file_checkers = {predicted_orfs: utils.check_gzip_file}

            # todo: implement file checker for fasta files
            shell_utils.call_if_not_exists(cmd,
                                           out_files,
                                           in_files=in_files,
                                           file_checkers=file_checkers,
                                           overwrite=args.overwrite,
                                           call=call)
def main():
    """Submit one `get-orf-peptide-matches` job per (cell type, peptide file).

    The pairs come from the `peptide_cell_type_analysis` mapping of the yaml
    config. A pair is skipped with a warning when the cell type is not among
    those returned by `ribo_utils.get_riboseq_cell_type_samples`, when the
    cell-type protein fasta does not exist, when the peptide file name is not
    listed under `peptide_files`, or when the peptide text file does not
    exist. Jobs are submitted with `slurm.check_sbatch`.

    `--note`, when given and non-empty, replaces the config note in the
    output filenames only; the input protein fasta always uses the config
    note.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script identifies the orf peptide matches for all samples in "
        "a project.")
    parser.add_argument('config', help="The (yaml) config file")
    
    parser.add_argument('--peptide-filter-field', help="The field to use for "
        "filtering the peptides from MaxQuant", default=default_peptide_filter_field)

    parser.add_argument('--peptide-filter-value', help="All peptides with a value "
        "greater than the filter value will be removed", type=float, 
        default=default_peptide_filter_value)

    parser.add_argument('--peptide-separator', help="The separator in the "
        "peptide file", default=default_peptide_separator)

    parser.add_argument('--note', help="If this option is given, it will be used in "
        "the output filenames.\n\nN.B. This REPLACES the note in the config file.", 
        default=default_note)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)
    
    logging_str = logging_utils.get_logging_options_string(args)

    # safe_load only constructs plain YAML types and does not need an
    # explicit Loader; the context manager closes the file handle
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)

    programs = [
        'get-orf-peptide-matches'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'peptide_files',
        'peptide_cell_type_analysis',
        'riboseq_data',
        'riboseq_samples'
    ]
    utils.check_keys_exist(config, required_keys)

    note_str = config.get('note', None)

    # a non-empty --note replaces the config note for the output files
    out_note_str = args.note if args.note else note_str

    args_dict = vars(args)

    # the lookup keys must match the argparse dests ('--peptide-filter-field'
    # is stored as 'peptide_filter_field'); otherwise the user's values are
    # never found and never forwarded
    peptide_filter_field_str = utils.get_config_argument(args_dict, 'peptide_filter_field')
    peptide_filter_value_str = utils.get_config_argument(args_dict, 'peptide_filter_value')
    peptide_separator_str = utils.get_config_argument(args_dict, 'peptide_separator')

    # NOTE(review): 'num_cpus' is not defined by this parser; presumably it
    # is added by slurm.add_sbatch_options -- confirm
    num_cpus_str = utils.get_config_argument(args_dict, 'num_cpus')

    cell_types = ribo_utils.get_riboseq_cell_type_samples(config)
    for cell_type, peptide_files in config['peptide_cell_type_analysis'].items():
        if cell_type not in cell_types:
            msg = ("Could not find cell_type specification. Please check the config "
                "file: {}".format(cell_type))
            logger.warning(msg)
            continue
            
        cell_type_protein = ribo_filenames.get_riboseq_cell_type_protein(
            config['riboseq_data'], cell_type, is_filtered=True, note=note_str)

        if not os.path.exists(cell_type_protein):
            msg = ("Could not find cell_type protein fasta. Skipping: {}".
                format(cell_type_protein))
            logger.warning(msg)
            continue
            
        for peptide_file in peptide_files:
            if peptide_file not in config['peptide_files']:
                msg = ("Could not find peptide_file specification. Please check "
                    "the config file: {}".format(peptide_file))
                logger.warning(msg)
                continue
                
            peptide_txt_file = config['peptide_files'][peptide_file]
                
            if not os.path.exists(peptide_txt_file):
                msg = ("Could not find peptide.txt file. Skipping: {}".
                    format(peptide_txt_file))
                logger.warning(msg)
                continue
                
            peptide_matches = ribo_filenames.get_riboseq_peptide_matches(
                config['riboseq_data'], cell_type, peptide_file, 
                is_filtered=True, note=out_note_str)
                
            cmd = "get-orf-peptide-matches {} {} {} {} {} {} {} {}".format(cell_type_protein, 
                peptide_txt_file, peptide_matches, num_cpus_str, peptide_filter_field_str, 
                peptide_filter_value_str, peptide_separator_str, logging_str)

            slurm.check_sbatch(cmd, args=args)
def main():
    """Submit the jobs that build per-read-length ORF profiles.

    For the sample named on the command line (or, with `--is-condition`,
    for every replicate of that condition) one `extract-orf-profiles` job is
    submitted through `slurm.check_sbatch` for each periodic
    (read length, offset) pair. A final `collect-read-length-orf-profiles`
    job is then submitted with the ids of the extraction jobs passed as its
    `dependencies`.

    If a sample has no periodic read lengths, a critical message is logged
    and the function returns without submitting the collection job.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Extract the ORF profiles for each specified read length "
        "and offset independently. One sparse matrix file will be created for "
        "each read length. It then collects the values into a sparse tensor.")

    parser.add_argument('config', help="The (json) config file")
    parser.add_argument('name',
                        help="The name for the dataset, used in the "
                        "created files")

    parser.add_argument('out',
                        help="The (mtx.gz) output file containing the "
                        "ORF profiles and read lengths")

    parser.add_argument(
        '-c',
        '--is-condition',
        help="If this flag is present, "
        "then \"name\" will be taken to be a condition name. The profiles for "
        "all relevant replicates of the condition will be created.",
        action='store_true')

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    # NOTE(review): 'num_cpus' is not defined by this parser; presumably it
    # is added by slurm.add_sbatch_options -- confirm
    cpus_str = "--num-cpus {}".format(args.num_cpus)

    msg = "[create-read-length-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    msg = "Reading config file"
    logger.info(msg)

    # safe_load only constructs plain YAML types and does not need an
    # explicit Loader; the context manager closes the file handle
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)

    # pull out what we need from the config file
    is_unique = 'keep_riboseq_multimappers' not in config
    seqname_str = utils.get_config_argument(config, 'seqname_prefix')
    note = config.get('note', None)
    orf_note = config.get('orf_note', None)

    orfs = filenames.get_orfs(config['genome_base_path'],
                              config['genome_name'],
                              note=orf_note)

    exons = filenames.get_exons(config['genome_base_path'],
                                config['genome_name'],
                                note=orf_note)

    # make sure the necessary files exist
    missing_msg = (
        "[create-read-length-orf-profiles]: Some input files were missing: ")
    utils.check_files_exist([orfs, exons], msg=missing_msg)

    # check which samples to process
    names = [args.name]
    if args.is_condition:
        riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
        names = list(riboseq_replicates[args.name])

    job_ids = []
    for name in names:

        msg = "Processing sample: {}".format(name)
        logger.info(msg)

        # now the relevant files
        bam = filenames.get_riboseq_bam(config['riboseq_data'],
                                        name,
                                        is_unique=is_unique,
                                        note=note)

        # make sure the necessary files exist
        utils.check_files_exist([bam], msg=missing_msg)

        # get the lengths and offsets which meet the required criteria from
        # the config file
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config, name, is_unique=is_unique)

        if len(lengths) == 0:
            msg = (
                "No periodic read lengths and offsets were found. Try relaxing "
                "min_metagene_profile_count, min_metagene_bf_mean, "
                "max_metagene_bf_var, and/or min_metagene_bf_likelihood. Qutting."
            )
            logger.critical(msg)
            return

        for length, offset in zip(lengths, offsets):
            lengths_str = "--lengths {}".format(length)
            offsets_str = "--offsets {}".format(offset)

            mtx = filenames.get_riboseq_profiles(config['riboseq_data'],
                                                 name,
                                                 length=[length],
                                                 offset=[offset],
                                                 is_unique=is_unique,
                                                 note=note)

            # nine placeholders for nine arguments: str.format silently
            # ignores surplus positional arguments, so a missing placeholder
            # would drop the logging options from the command
            cmd = "extract-orf-profiles {} {} {} {} {} {} {} {} {}".format(
                bam, orfs, exons, mtx, lengths_str, offsets_str, seqname_str,
                cpus_str, logging_str)

            job_id = slurm.check_sbatch(cmd, args=args)

            job_ids.append(job_id)

    # now, collect them into a single file; the collection command only
    # receives the config, name and output, not the lengths and offsets
    is_condition_str = ""
    if args.is_condition:
        is_condition_str = "--is-condition"

    cmd = "collect-read-length-orf-profiles {} {} {} {} {}".format(
        args.config, args.name, args.out, is_condition_str, logging_str)

    slurm.check_sbatch(cmd, args=args, dependencies=job_ids)
def main():
    """Submit one `get-orf-peptide-matches` job per (cell type, peptide file).

    The pairs come from the `peptide_cell_type_analysis` mapping of the yaml
    config. A pair is skipped with a warning when the cell type is not among
    those returned by `ribo_utils.get_riboseq_cell_type_samples`, when the
    cell-type protein fasta does not exist, when the peptide file name is not
    listed under `peptide_files`, or when the peptide text file does not
    exist. Jobs are submitted with `slurm.check_sbatch`.

    `--note`, when given and non-empty, replaces the config note in the
    output filenames only; the input protein fasta always uses the config
    note.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script identifies the orf peptide matches for all samples in "
        "a project.")
    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--peptide-filter-field',
                        help="The field to use for "
                        "filtering the peptides from MaxQuant",
                        default=default_peptide_filter_field)

    parser.add_argument('--peptide-filter-value',
                        help="All peptides with a value "
                        "greater than the filter value will be removed",
                        type=float,
                        default=default_peptide_filter_value)

    parser.add_argument('--peptide-separator',
                        help="The separator in the "
                        "peptide file",
                        default=default_peptide_separator)

    parser.add_argument(
        '--note',
        help="If this option is given, it will be used in "
        "the output filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=default_note)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)

    # safe_load only constructs plain YAML types and does not need an
    # explicit Loader; the context manager closes the file handle
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)

    programs = ['get-orf-peptide-matches']
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'peptide_files', 'peptide_cell_type_analysis', 'riboseq_data',
        'riboseq_samples'
    ]
    utils.check_keys_exist(config, required_keys)

    note_str = config.get('note', None)

    # a non-empty --note replaces the config note for the output files
    out_note_str = args.note if args.note else note_str

    args_dict = vars(args)

    # the lookup keys must match the argparse dests ('--peptide-filter-field'
    # is stored as 'peptide_filter_field'); otherwise the user's values are
    # never found and never forwarded
    peptide_filter_field_str = utils.get_config_argument(
        args_dict, 'peptide_filter_field')
    peptide_filter_value_str = utils.get_config_argument(
        args_dict, 'peptide_filter_value')
    peptide_separator_str = utils.get_config_argument(args_dict,
                                                      'peptide_separator')

    # NOTE(review): 'num_cpus' is not defined by this parser; presumably it
    # is added by slurm.add_sbatch_options -- confirm
    num_cpus_str = utils.get_config_argument(args_dict, 'num_cpus')

    cell_types = ribo_utils.get_riboseq_cell_type_samples(config)
    cell_type_analysis = config['peptide_cell_type_analysis']
    for cell_type, peptide_files in cell_type_analysis.items():
        if cell_type not in cell_types:
            msg = (
                "Could not find cell_type specification. Please check the config "
                "file: {}".format(cell_type))
            logger.warning(msg)
            continue

        cell_type_protein = ribo_filenames.get_riboseq_cell_type_protein(
            config['riboseq_data'], cell_type, is_filtered=True, note=note_str)

        if not os.path.exists(cell_type_protein):
            msg = ("Could not find cell_type protein fasta. Skipping: {}".
                   format(cell_type_protein))
            logger.warning(msg)
            continue

        for peptide_file in peptide_files:
            if peptide_file not in config['peptide_files']:
                msg = (
                    "Could not find peptide_file specification. Please check "
                    "the config file: {}".format(peptide_file))
                logger.warning(msg)
                continue

            peptide_txt_file = config['peptide_files'][peptide_file]

            if not os.path.exists(peptide_txt_file):
                msg = ("Could not find peptide.txt file. Skipping: {}".format(
                    peptide_txt_file))
                logger.warning(msg)
                continue

            peptide_matches = ribo_filenames.get_riboseq_peptide_matches(
                config['riboseq_data'],
                cell_type,
                peptide_file,
                is_filtered=True,
                note=out_note_str)

            cmd = "get-orf-peptide-matches {} {} {} {} {} {} {} {}".format(
                cell_type_protein, peptide_txt_file, peptide_matches,
                num_cpus_str, peptide_filter_field_str,
                peptide_filter_value_str, peptide_separator_str, logging_str)

            slurm.check_sbatch(cmd, args=args)
def _create_figures(name_pretty_name_is_replicate, config, args):
    """ This function creates all of the figures in the prediction report
        for the given dataset.

        Each figure is produced by building the command line of a helper
        script and handing it to shell_utils.call_if_not_exists together
        with the expected input and output files:

            * create-orf-types-bar-chart, for every requested combination
              of grouped/ungrouped, chisq/Rp-Bp and filtered/unfiltered
            * create-orf-length-distribution-line-graph, for every
              requested combination of grouped/ungrouped and chisq/Rp-Bp
            * visualize-orf-type-metagene-profiles, only if
              args.show_orf_periodicity is set

        Args:
            name_pretty_name_is_replicate (tuple): the triple
                (name, pretty_name, is_replicate). name is used to look up
                files, pretty_name is used in the plot titles, and
                is_replicate disables the periodic length/offset lookup.

            config (dict): the loaded configuration. 'riboseq_data' is
                required; 'note', 'smoothing_fraction',
                'smoothing_reweighting_iterations' and
                'keep_riboseq_multimappers' are optional.

            args (argparse.Namespace): the command line options. This
                function reads show_chisq, show_unfiltered_orfs,
                show_orf_periodicity, note, image_type, uniprot,
                uniprot_label and overwrite, and passes the namespace to
                logging_utils.get_logging_options_string.

        Returns:
            None. If the periodic lengths and offsets cannot be found for a
            non-replicate sample, an error is logged and nothing is created.
    """
    name, pretty_name, is_replicate = name_pretty_name_is_replicate

    # keep multimappers?
    is_unique = 'keep_riboseq_multimappers' not in config

    # by default, we will not include chisq
    chisq_values = [True, False] if args.show_chisq else [False]

    # by default, we only show the filtered ORFs
    filtered_values = [True, False] if args.show_unfiltered_orfs else [True]

    grouped_values = [True, False]

    logging_str = logging_utils.get_logging_options_string(args)

    # note_str locates the existing prediction files; out_note_str is used
    # for the created images and can be overridden on the command line
    note_str = config.get('note', None)
    out_note_str = note_str
    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note

    image_type_str = "--image-type {}".format(args.image_type)

    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations', None)

    # if this is a replicate, we do not worry about lengths and offsets
    if is_replicate:
        lengths = None
        offsets = None
    else:
        try:
            lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                config, name, is_unique=is_unique)
        except FileNotFoundError:
            msg = ("Could not parse out lengths and offsets for sample: {}. "
                "Skipping".format(name))
            logger.error(msg)
            return

    def get_predicted_orfs(**options):
        """ Look up the predicted ORFs file for this sample; options are
            the keyword arguments which differ between the plots.
        """
        return filenames.get_riboseq_predicted_orfs(
            config['riboseq_data'],
            name,
            length=lengths,
            offset=offsets,
            is_unique=is_unique,
            note=note_str,
            **options
        )

    def get_title_str(title):
        """ Build the shell-quoted --title option for the given title. """
        return "--title {}".format(shlex.quote(title))

    unsmoothed_profiles = filenames.get_riboseq_profiles(
        config['riboseq_data'],
        name,
        length=lengths,
        offset=offsets,
        is_unique=is_unique,
        note=note_str,
        is_smooth=False
    )

    msg = "{}: creating the ORF types bar charts".format(name)
    logger.debug(msg)

    it = itertools.product(grouped_values, chisq_values, filtered_values)

    for is_grouped, is_chisq, is_filtered in it:

        is_grouped_str = ", Grouped" if is_grouped else ""
        is_filtered_str = ", Filtered" if is_filtered else ""

        if is_chisq:
            # the backslash is escaped so the title text is still "\chi"
            # without relying on an invalid escape sequence
            title = "{}{}{}, Rp-$\\chi^2$".format(
                pretty_name, is_grouped_str, is_filtered_str)

            # the smoothing parameters are not used for the chisq files
            f = None
            rw = None

            orfs = get_predicted_orfs(is_chisq=True, is_filtered=is_filtered)

        else:
            title = "{}{}{}, Rp-Bp".format(
                pretty_name, is_grouped_str, is_filtered_str)

            f = fraction
            rw = reweighting_iterations

            orfs = get_predicted_orfs(
                fraction=f,
                reweighting_iterations=rw,
                is_filtered=is_filtered
            )

        title_str = get_title_str(title)
        use_groups_str = "--use-groups" if is_grouped else ""

        orf_types_bar_chart = filenames.get_orf_types_bar_chart(
            config['riboseq_data'],
            name,
            length=lengths,
            offset=offsets,
            is_unique=is_unique,
            note=out_note_str,
            image_type=args.image_type,
            fraction=f,
            reweighting_iterations=rw,
            is_grouped=is_grouped,
            is_chisq=is_chisq,
            is_filtered=is_filtered
        )

        cmd = "create-orf-types-bar-chart {} {} {} {}".format(
            orfs,
            orf_types_bar_chart,
            title_str,
            use_groups_str
        )

        in_files = [orfs]
        out_files = [orf_types_bar_chart]
        shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
            overwrite=args.overwrite)

    msg = "{}: creating the ORF length distributions line graph".format(name)
    logger.debug(msg)

    # the uniprot options are only passed if the uniprot file exists
    uniprot_str = ""
    uniprot_label_str = ""
    if os.path.exists(args.uniprot):
        uniprot_str = "--uniprot {}".format(args.uniprot)
        uniprot_label_str = shlex.quote(args.uniprot_label)
        uniprot_label_str = "--uniprot-label {}".format(uniprot_label_str)

    for is_grouped, is_chisq in itertools.product(grouped_values, chisq_values):

        if is_chisq:
            title = "{}, Rp-$\\chi^2$".format(pretty_name)

            f = None
            rw = None

            orfs = get_predicted_orfs(is_chisq=True)

        else:
            title = "{}, Rp-Bp".format(pretty_name)

            f = fraction
            rw = reweighting_iterations

            orfs = get_predicted_orfs(fraction=f, reweighting_iterations=rw)

        title_str = get_title_str(title)
        use_groups_str = "--use-groups" if is_grouped else ""

        orf_length_line_graph = filenames.get_orf_length_distribution_line_graph(
            config['riboseq_data'],
            name,
            length=lengths,
            offset=offsets,
            is_unique=is_unique,
            note=out_note_str,
            image_type=args.image_type,
            fraction=f,
            reweighting_iterations=rw,
            is_grouped=is_grouped,
            is_chisq=is_chisq
        )

        cmd = ("create-orf-length-distribution-line-graph {} {} {} {} {} {}".format(
            orfs,
            orf_length_line_graph,
            title_str,
            use_groups_str,
            uniprot_str,
            uniprot_label_str
        ))

        in_files = [orfs]
        out_files = [orf_length_line_graph]
        shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
            overwrite=args.overwrite)

    if args.show_orf_periodicity:
        msg = "{}: creating the ORF type metagene profiles".format(name)
        logger.debug(msg)

        # NOTE(review): this section used to read the "is_filtered" variable
        # left over from the bar chart loop above, i.e., the last entry of
        # filtered_values. That value is kept here, but made explicit. Only
        # the chisq lookup passes it; please confirm that this asymmetry
        # with the Rp-Bp lookup is intended.
        profile_is_filtered = filtered_values[-1]

        for is_chisq in chisq_values:

            if is_chisq:
                title = "{}, Rp-$\\chi^2$".format(pretty_name)

                f = None
                rw = None

                orfs = get_predicted_orfs(
                    is_chisq=True,
                    is_filtered=profile_is_filtered
                )

            else:
                title = "{}, Rp-Bp".format(pretty_name)

                f = fraction
                rw = reweighting_iterations

                orfs = get_predicted_orfs(fraction=f, reweighting_iterations=rw)

            title_str = get_title_str(title)

            # both methods are visualized with the unsmoothed profiles
            profiles = unsmoothed_profiles

            orf_type_profile_base = filenames.get_orf_type_profile_base(
                config['riboseq_data'],
                name,
                length=lengths,
                offset=offsets,
                is_unique=is_unique,
                note=out_note_str,
                fraction=f,
                reweighting_iterations=rw,
                is_chisq=is_chisq
            )

            # one image is expected for each ORF type on each strand
            orf_type_profiles_forward = [
                filenames.get_orf_type_profile_image(
                    orf_type_profile_base,
                    orf_type,
                    "+",
                    args.image_type
                )   for orf_type in ribo_utils.orf_types
            ]

            orf_type_profiles_reverse = [
                filenames.get_orf_type_profile_image(
                    orf_type_profile_base,
                    orf_type,
                    "-",
                    args.image_type
                )   for orf_type in ribo_utils.orf_types
            ]

            cmd = ("visualize-orf-type-metagene-profiles {} {} {} {} {} {}".format(
                orfs,
                profiles,
                orf_type_profile_base,
                title_str,
                image_type_str,
                logging_str
            ))

            in_files = [orfs]
            out_files = orf_type_profiles_forward + orf_type_profiles_reverse
            shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                overwrite=args.overwrite)
def main():
    """ Run the Rp-Bp and Rp-chi pipelines on a single sample.

        The command line options are parsed, and the required programs and
        config keys are checked. If --use-slurm was given, the original
        command line is handed to slurm.check_sbatch and nothing else
        happens. Otherwise, create-orf-profiles is called and, unless
        --profiles-only was given, predict-translated-orfs is called
        afterward.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script runs the Rp-Bp and Rp-chi pipelines on a given sample. "
        "It requires a YAML config file that includes a number of keys. Please see the "
        "documentation for a complete description.")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument(
        'name', help="The name for the dataset, used in the created files")

    parser.add_argument('--tmp',
                        help="The temp directory",
                        default=default_tmp)

    parser.add_argument(
        '--flexbar-format-option',
        help="The name of the \"format\" "
        "option for flexbar. This changed from \"format\" to \"qtrim-format\" in "
        "version 2.7.",
        default=default_flexbar_format_option)

    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files "
                        "will be overwritten.",
                        action='store_true')

    parser.add_argument('--profiles-only',
                        help="If this flag is present, then only "
                        "the ORF profiles will be created",
                        action='store_true')

    parser.add_argument(
        '-k',
        '--keep-intermediate-files',
        help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.",
        action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    # safe_load only constructs plain YAML types, and the context manager
    # closes the config file. Calling yaml.load without an explicit Loader
    # is rejected by PyYAML 6.
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)

    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar', args.star_executable, 'samtools', 'bowtie2',
        'create-base-genome-profile', 'remove-multimapping-reads',
        'extract-metagene-profiles', 'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets', 'extract-orf-profiles',
        'estimate-orf-bayes-factors', 'select-final-prediction-set',
        'create-orf-profiles', 'predict-translated-orfs'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data', 'ribosomal_index', 'star_index', 'genome_base_path',
        'genome_name', 'fasta', 'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    # now, check if we want to use slurm
    msg = "use_slurm: {}".format(args.use_slurm)
    logger.debug(msg)

    if args.use_slurm:
        # resubmit the exact command line we were called with
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # the first step is the standard riboseq preprocessing

    # handle do_not_call so that we _do_ call the preprocessing script,
    # but that it does not run anything
    do_not_call_str = "" if call else "--do-not-call"
    overwrite_str = "--overwrite" if args.overwrite else ""

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    flexbar_format_option_str = ""
    if args.flexbar_format_option is not None:
        flexbar_format_option_str = "--flexbar-format-option {}".format(
            args.flexbar_format_option)

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    # for a sample, we first create its filtered genome profile
    cmd = ("create-orf-profiles {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}"
           .format(args.raw_data, args.config, args.name, args.num_cpus,
                   mem_str, do_not_call_str, overwrite_str, logging_str,
                   star_str, tmp_str, flexbar_format_option_str,
                   keep_intermediate_str))

    shell_utils.check_call(cmd)

    # check if we only want to create the profiles
    if args.profiles_only:
        return

    # then we predict the ORFs
    cmd = ("predict-translated-orfs {} {} --num-cpus {} {} {} {}".format(
        args.config, args.name, args.num_cpus, do_not_call_str, overwrite_str,
        logging_str))
    shell_utils.check_call(cmd)
def main():
    """ Create the ORF profiles for a single riboseq sample.

        After parsing the command line and checking the required programs
        and config keys, the following commands are built and passed to
        shell_utils.call_if_not_exists, in order:

            1. create-base-genome-profile (always invoked; --do-not-call is
               passed through to it)
            2. extract-metagene-profiles
            3. estimate-metagene-profile-bayes-factors
            4. select-periodic-offsets
            5. extract-orf-profiles

        If no periodic read lengths are found after step 4, a critical
        message is logged and the function returns without running step 5.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script runs all of the processing necessary to produce the "
        "signals used for later processing. In particular, it runs the standard "
        "rnaseq and riboseq preprocessing, estimates the abundance of transcripts with "
        "the rnaseq data, and selects the most-expressed isoforms and ORFs. Next, it removes "
        "multimapping and non-periodic-length riboseq reads. Finally, it extracts the riboseq "
        "signal for the most-expressed transcripts.")
    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
        type=int, default=default_num_cpus)

    parser.add_argument('--mem', help="The amount of RAM to request",
        default=default_mem)

    # the pieces of the help text need explicit separating spaces because
    # adjacent string literals are joined as-is
    parser.add_argument('--flexbar-options', help="A space-delimited list of options to "
        "pass to flexbar. Each option must be quoted separately as in \"--flexbarOption value\". "
        "If specified, flexbar options will override default settings.", nargs='*', type=str)

    parser.add_argument('--tmp', help="The location for temp files", default=default_tmp)

    parser.add_argument('--do-not-call', action='store_true')
    parser.add_argument('--overwrite', help="If this flag is present, existing files "
        "will be overwritten.", action='store_true')

    parser.add_argument('-k', '--keep-intermediate-files', help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.", action='store_true')

    star_utils.add_star_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[create-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    # safe_load only constructs plain YAML types, and the context manager
    # closes the config file. Calling yaml.load without an explicit Loader
    # is rejected by PyYAML 6.
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)

    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs =  [   'flexbar',
                    args.star_executable,
                    'samtools',
                    'bowtie2',
                    'create-base-genome-profile',
                    'remove-multimapping-reads',
                    'extract-metagene-profiles',
                    'estimate-metagene-profile-bayes-factors',
                    'select-periodic-offsets',
                    'extract-orf-profiles'
                ]
    shell_utils.check_programs_exist(programs)

    required_keys = [   'riboseq_data',
                        'ribosomal_index',
                        'gtf',
                        'genome_base_path',
                        'genome_name'
                    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)

    models_base = config.get('models_base', default_models_base)

    # the first step is the standard riboseq preprocessing

    # handle do_not_call so that we _do_ call the preprocessing script,
    # but that it does not run anything
    do_not_call_argument = ""
    if not call:
        do_not_call_argument = "--do-not-call"

    overwrite_argument = ""
    if args.overwrite:
        overwrite_argument = "--overwrite"

    orfs_genomic = filenames.get_orfs(config['genome_base_path'], config['genome_name'],
        note=config.get('orf_note'))

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    # each flexbar option is wrapped in double quotes so that it is passed
    # on as a single argument
    flexbar_option_str = ""
    if args.flexbar_options is not None:
        flexbar_option_str = "--flexbar-options {}".format(' '.join('"' + flx_op + '"'
            for flx_op in args.flexbar_options))

    # check if we want to keep multimappers
    is_unique = not ('keep_riboseq_multimappers' in config)

    riboseq_raw_data = args.raw_data
    riboseq_bam_filename = filenames.get_riboseq_bam(config['riboseq_data'], args.name,
        is_unique=is_unique, note=note)

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    cmd = ("create-base-genome-profile {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}"
        .format(riboseq_raw_data, args.config, args.name, args.num_cpus,
        do_not_call_argument, overwrite_argument, logging_str, star_str, tmp_str,
        flexbar_option_str, keep_intermediate_str, mem_str))

    # There could be cases where we start somewhere in the middle of creating
    # the base genome profile. So even if the "raw data" is not available,
    # we still want to call the base pipeline. For that reason, the raw data
    # is deliberately not listed as an input file.
    in_files = []
    out_files = [riboseq_bam_filename]
    # we always call this, and pass --do-not-call through
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
        overwrite=args.overwrite, call=True)

    # create the metagene profiles
    metagene_profiles = filenames.get_metagene_profiles(config['riboseq_data'],
        args.name, is_unique=is_unique, note=note)

    start_upstream_str = utils.get_config_argument(config,
        'metagene_profile_start_upstream', 'start-upstream')
    start_downstream_str = utils.get_config_argument(config,
        'metagene_profile_start_downstream', 'start-downstream')
    end_upstream_str = utils.get_config_argument(config,
        'metagene_profile_end_upstream', 'end-upstream')
    end_downstream_str = utils.get_config_argument(config,
        'metagene_profile_end_downstream', 'end-downstream')

    # use the canonical transcripts for extracting the metagene profiles
    transcript_bed = filenames.get_bed(config['genome_base_path'],
        config['genome_name'], is_merged=False, is_annotated=True)

    cmd = ("extract-metagene-profiles {} {} {} --num-cpus {} {} {} {} {} {}"
        .format(riboseq_bam_filename, transcript_bed, metagene_profiles,
        args.num_cpus, logging_str, start_upstream_str,
        start_downstream_str, end_upstream_str, end_downstream_str))

    in_files = [riboseq_bam_filename, orfs_genomic]
    out_files = [metagene_profiles]
    file_checkers = {
        metagene_profiles: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
        file_checkers=file_checkers, overwrite=args.overwrite, call=call)

    # estimate the periodicity for each offset for all read lengths
    metagene_profile_bayes_factors = filenames.get_metagene_profiles_bayes_factors(
        config['riboseq_data'], args.name, is_unique=is_unique, note=note)

    # the model files are looked up below models_base
    periodic_models = filenames.get_models(models_base, 'periodic')
    non_periodic_models = filenames.get_models(models_base, 'nonperiodic')

    periodic_models_str = ' '.join(periodic_models)
    non_periodic_models_str = ' '.join(non_periodic_models)

    periodic_models_str = "--periodic-models {}".format(periodic_models_str)
    non_periodic_models_str = "--nonperiodic-models {}".format(non_periodic_models_str)

    periodic_offset_start_str = utils.get_config_argument(config, 'periodic_offset_start')
    periodic_offset_end_str = utils.get_config_argument(config, 'periodic_offset_end')
    metagene_profile_length_str = utils.get_config_argument(config, 'metagene_profile_length')
    seed_str = utils.get_config_argument(config, 'seed')
    chains_str = utils.get_config_argument(config, 'chains')
    iterations_str = utils.get_config_argument(config, 'metagene_profile_iterations', 'iterations')

    cmd = ("estimate-metagene-profile-bayes-factors {} {} --num-cpus {} {} {} "
        "{} {} {} {} {} {} {}".format(metagene_profiles,
        metagene_profile_bayes_factors, args.num_cpus,
        periodic_models_str, non_periodic_models_str,
        periodic_offset_start_str, periodic_offset_end_str, metagene_profile_length_str,
        seed_str, chains_str, iterations_str, logging_str))

    in_files = [metagene_profiles]
    in_files.extend(periodic_models)
    in_files.extend(non_periodic_models)
    out_files = [metagene_profile_bayes_factors]
    file_checkers = {
        metagene_profile_bayes_factors: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
        file_checkers=file_checkers, overwrite=args.overwrite, call=call)

    # select the best read lengths for constructing the signal
    periodic_offsets = filenames.get_periodic_offsets(config['riboseq_data'],
        args.name, is_unique=is_unique, note=note)

    cmd = "select-periodic-offsets {} {}".format(metagene_profile_bayes_factors,
        periodic_offsets)
    in_files = [metagene_profile_bayes_factors]
    out_files = [periodic_offsets]
    file_checkers = {
        periodic_offsets: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
        file_checkers=file_checkers, overwrite=args.overwrite, call=call)

    # get the lengths and offsets which meet the required criteria from the config file
    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(config,
        args.name, args.do_not_call, is_unique=is_unique)

    if len(lengths) == 0:
        msg = ("No periodic read lengths and offsets were found. Try relaxing "
            "min_metagene_profile_count, min_metagene_bf_mean, max_metagene_bf_var, "
            "and/or min_metagene_bf_likelihood. Quitting.")
        logger.critical(msg)
        return

    # NOTE(review): str.join requires the lengths and offsets to be strings;
    # presumably get_periodic_lengths_and_offsets returns them that way
    lengths_str = ' '.join(lengths)
    offsets_str = ' '.join(offsets)

    seqname_prefix_str = utils.get_config_argument(config, 'seqname_prefix')

    # extract the riboseq profiles for each orf
    unique_filename = filenames.get_riboseq_bam(config['riboseq_data'], args.name,
        is_unique=is_unique, note=note)

    profiles_filename = filenames.get_riboseq_profiles(config['riboseq_data'], args.name,
        length=lengths, offset=offsets, is_unique=is_unique, note=note)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'], config['genome_name'],
        note=config.get('orf_note'))

    exons_file = filenames.get_exons(config['genome_base_path'], config['genome_name'],
        note=config.get('orf_note'), is_orf=True)

    cmd = ("extract-orf-profiles {} {} {} {} --lengths {} --offsets {} {} {} --num-cpus {} ".format(
            unique_filename, orfs_genomic, exons_file, profiles_filename, lengths_str,
            offsets_str, logging_str, seqname_prefix_str, args.num_cpus))
    in_files = [orfs_genomic, exons_file, unique_filename]
    out_files = [profiles_filename]

    #todo: implement a file checker for mtx files
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
        overwrite=args.overwrite, call=call)
def main():
    """ Create the reference files needed for downstream rpbp analysis.

        After parsing the command line and checking the required programs,
        config keys and input files, this function (unless the work is
        handed to slurm):

            1. builds the ribosomal index with bowtie2-build-s
            2. builds the STAR genome index
            3. extracts the annotated ORFs with get_orfs
            4. if 'de_novo_gtf' is in the config, extracts the de novo ORFs
               and concatenates the annotated and de novo ORF, exon and
               annotation files; otherwise, symlinks the annotated files to
               the final file names

        The in-process concatenation steps are skipped when --do-not-call
        was given.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates all of the files necessary for downstream "
        "analysis performed with the rpbp package.")
    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--overwrite', help="If this flag is present, existing files "
        "will be overwritten.", action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)

    # safe_load only constructs plain YAML types, and the context manager
    # closes the config file. Calling yaml.load without an explicit Loader
    # is rejected by PyYAML 6.
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)

    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs =  [
                 'extract-orf-coordinates',
                 'label-orfs',
                 'bowtie2-build-s',
                 'split-bed12-blocks',
                 'gtf-to-bed12',
                 args.star_executable
                ]
    shell_utils.check_programs_exist(programs)

    required_keys = [   'genome_base_path',
                        'genome_name',
                        'gtf',
                        'fasta',
                        'ribosomal_fasta',
                        'ribosomal_index',
                        'star_index'
                    ]
    utils.check_keys_exist(config, required_keys)

    # check that the required files are present
    files = [
        config['gtf'],
        config['fasta'],
        config['ribosomal_fasta']
    ]

    if 'de_novo_gtf' in config:
        files += [config['de_novo_gtf']]

    utils.check_files_exist(files, source='prepare-rpbp-genome')

    # now, check if we want to use slurm
    if args.use_slurm:
        # resubmit the exact command line we were called with
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # the rrna index
    cmd = "bowtie2-build-s {} {}".format(config['ribosomal_fasta'],
        config['ribosomal_index'])

    in_files = [config['ribosomal_fasta']]
    out_files = bio.get_bowtie2_index_files(config['ribosomal_index'])
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
        overwrite=args.overwrite, call=call)

    # the STAR index
    mem = utils.human2bytes(args.mem)
    cmd = ("{} --runMode genomeGenerate --genomeDir {} --genomeFastaFiles {} "
        "--runThreadN {} --limitGenomeGenerateRAM {}".format(args.star_executable,
        config['star_index'], config['fasta'], args.num_cpus, mem))

    in_files = [config['fasta']]
    out_files = star_utils.get_star_index_files(config['star_index'])
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
        overwrite=args.overwrite, call=call)

    # get the main orfs
    get_orfs(config['gtf'], args, config, is_annotated=True, is_de_novo=False)

    # eventually, we will use these names
    annotated_orfs = filenames.get_orfs(config['genome_base_path'],
        config['genome_name'], note=config.get('orf_note'), is_annotated=True,
        is_de_novo=False)

    annotated_exons_file = filenames.get_exons(config['genome_base_path'],
        config['genome_name'], note=config.get('orf_note'),
        is_annotated=True, is_de_novo=False, is_orf=True)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
        config['genome_name'], note=config.get('orf_note'))

    exons_file = filenames.get_exons(config['genome_base_path'],
        config['genome_name'], note=config.get('orf_note'), is_orf=True)

    # NOTE(review): only a file name ending in "gff" is treated as GFF3;
    # a ".gff3" extension is not matched. Please confirm this is intended.
    use_gff3_specs = config['gtf'].endswith('gff')
    gtf_file = filenames.get_gtf(config['genome_base_path'],
        config['genome_name'], is_gff3=use_gff3_specs, is_star_input=True)

    # now, check if we have a de novo assembly
    if 'de_novo_gtf' in config:
        get_orfs(config['de_novo_gtf'], args, config, is_annotated=False,
            is_de_novo=True)

        # we need to concat the ORF and exon files
        de_novo_orfs = filenames.get_orfs(config['genome_base_path'],
            config['genome_name'], note=config.get('orf_note'), is_annotated=False,
            is_de_novo=True)

        de_novo_exons_file = filenames.get_exons(config['genome_base_path'],
            config['genome_name'], note=config.get('orf_note'),
            is_annotated=False, is_de_novo=True, is_orf=True)

        orfs_files = [annotated_orfs, de_novo_orfs]

        orfs_files_str = ' '.join(orfs_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            orfs_genomic, orfs_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(orfs_files, sort_bed=True)
            # renumber the ORFs over the combined, sorted set
            concatenated_bed['orf_num'] = range(len(concatenated_bed))
            fields = bed_utils.bed12_field_names + ['orf_num', 'orf_len', 'orf_type']
            bed_utils.write_bed(concatenated_bed[fields], orfs_genomic)
        else:
            msg = "Skipping concatenation due to --do-not-call value"
            logger.info(msg)

        exons_files = [annotated_exons_file, de_novo_exons_file]

        exons_files_str = ' '.join(exons_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            exons_file, exons_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(exons_files, sort_bed=True)
            fields = bed_utils.bed6_field_names + ['exon_index', 'transcript_start']
            bed_utils.write_bed(concatenated_bed[fields], exons_file)
        else:
            msg = "Skipping concatenation due to --do-not-call value"
            logger.info(msg)

        # we also need to concat the annotations to inform STAR
        # there is no particular reason to merge and sort the files, so
        # we just concatenate them...
        if (config['de_novo_gtf'].endswith('gff') == use_gff3_specs):
            # awk drops the comment lines (those starting with "#")
            cmd = ("awk '!/^#/' {} {} > {}".format(config['gtf'], config['de_novo_gtf'], gtf_file))
            in_files = [config['gtf'], config['de_novo_gtf']]
            out_files = [gtf_file]
            shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                overwrite=args.overwrite, call=call)
        else:
            msg = ("Skipping concatenation due to mismatch in format specifications (GTF2/GFF3) "
                  "for reference and de novo annotations. Symlink to reference annotations created.")
            logger.warning(msg)
            if os.path.exists(config['gtf']):
                shell_utils.create_symlink(config['gtf'], gtf_file, call)

    else:
        # finally, make sure our files are named correctly

        if os.path.exists(annotated_orfs):
            shell_utils.create_symlink(annotated_orfs, orfs_genomic, call)

        if os.path.exists(annotated_exons_file):
            shell_utils.create_symlink(annotated_exons_file, exons_file, call)

        if os.path.exists(config['gtf']):
            shell_utils.create_symlink(config['gtf'], gtf_file, call)
def main():
    """Submit ORF profile extraction jobs per read length, then a collection job.

    The command line is parsed for a config file, a dataset (or condition)
    name and an output file. For each sample to process, one
    ``extract-orf-profiles`` command is built and handed to
    ``slurm.check_sbatch`` for every periodic (read length, offset) pair.
    Finally a single ``collect-read-length-orf-profiles`` command is
    submitted with the previously returned job ids as its dependencies.

    The function returns early, without submitting the collection job, as
    soon as a sample has no periodic read lengths.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Extract the ORF profiles for each specified read length "
        "and offset independently. One sparse matrix file will be created for "
        "each read length. It then collects the values into a sparse tensor.")

    parser.add_argument('config', help="The (json) config file")
    parser.add_argument('name', help="The name for the dataset, used in the "
        "created files")

    parser.add_argument('out', help="The (mtx.gz) output file containing the "
        "ORF profiles and read lengths")

    parser.add_argument('-c', '--is-condition', help="If this flag is present, "
        "then \"name\" will be taken to be a condition name. The profiles for "
        "all relevant replicates of the condition will be created.",
        action='store_true')

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    # presumably --num-cpus is registered by slurm.add_sbatch_options; verify
    cpus_str = "--num-cpus {}".format(args.num_cpus)

    msg = "[create-read-length-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    msg = "Reading config file"
    logger.info(msg)
    # NOTE(review): yaml.load without an explicit Loader may construct
    # arbitrary objects, and recent PyYAML releases require the argument.
    # Consider yaml.safe_load if the config files only hold plain data.
    with open(args.config) as config_handle:
        config = yaml.load(config_handle)

    # pull out what we need from the config file
    is_unique = 'keep_riboseq_multimappers' not in config
    seqname_str = utils.get_config_argument(config, 'seqname_prefix')
    note = config.get('note', None)
    orf_note = config.get('orf_note', None)

    orfs = filenames.get_orfs(
        config['genome_base_path'],
        config['genome_name'],
        note=orf_note
    )

    exons = filenames.get_exons(
        config['genome_base_path'],
        config['genome_name'],
        note=orf_note,
        is_orf=True
    )

    # make sure the necessary files exist
    required_files = [orfs, exons]
    msg = "[create-read-length-orf-profiles]: Some input files were missing: "
    utils.check_files_exist(required_files, msg=msg)

    # check which samples to process: either the single named sample, or
    # every replicate registered for the named condition
    names = [args.name]
    if args.is_condition:
        riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
        names = list(riboseq_replicates[args.name])

    job_ids = []
    for name in names:

        msg = "Processing sample: {}".format(name)
        logger.info(msg)

        # now the relevant files
        bam = filenames.get_riboseq_bam(
            config['riboseq_data'],
            name,
            is_unique=is_unique,
            note=note
        )

        # make sure the necessary files exist
        required_files = [bam]
        msg = "[create-read-length-orf-profiles]: Some input files were missing: "
        utils.check_files_exist(required_files, msg=msg)

        # get the lengths and offsets which meet the required criteria from the config file
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config,
            name,
            is_unique=is_unique
        )

        if len(lengths) == 0:
            msg = ("No periodic read lengths and offsets were found. Try relaxing "
                "min_metagene_profile_count, min_metagene_bf_mean, "
                "max_metagene_bf_var, and/or min_metagene_bf_likelihood. Qutting.")
            logger.critical(msg)
            return

        # one extraction job per (length, offset) pair
        for length, offset in zip(lengths, offsets):
            lengths_str = "--lengths {}".format(length)
            offsets_str = "--offsets {}".format(offset)

            mtx = filenames.get_riboseq_profiles(
                config['riboseq_data'],
                name,
                length=[length],
                offset=[offset],
                is_unique=is_unique,
                note=note
            )

            # nine placeholders for nine arguments: with only eight, the
            # trailing logging options were silently discarded by str.format
            cmd = "extract-orf-profiles {} {} {} {} {} {} {} {} {}".format(
                bam,
                orfs,
                exons,
                mtx,
                lengths_str,
                offsets_str,
                seqname_str,
                cpus_str,
                logging_str
            )

            job_id = slurm.check_sbatch(cmd, args=args)

            job_ids.append(job_id)

    # now, collect them into a single file; the collection command only
    # needs the config, name and output, so no lengths/offsets are passed
    is_condition_str = ""
    if args.is_condition:
        is_condition_str = "--is-condition"

    cmd = "collect-read-length-orf-profiles {} {} {} {} {}".format(
        args.config,
        args.name,
        args.out,
        is_condition_str,
        logging_str
    )

    # the collection step must wait for every extraction job
    slurm.check_sbatch(cmd, args=args, dependencies=job_ids)
Example #23
0
def main():
    """Submit one ``run-rpbp-pipeline`` command per sample in the config.

    The command line is parsed, the required programs and config keys are
    checked, and a ``run-rpbp-pipeline`` command is built for every entry of
    ``config['riboseq_samples']`` (in sorted order) and handed to
    ``slurm.check_sbatch``. When ``--merge-replicates`` is given, a
    ``predict-translated-orfs`` command is additionally submitted for each
    condition, depending on all of the per-sample job ids.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This is a helper script which submits a set of samples to SLURM. It "
        "can also be used to run a set of samples sequentially. Due to limitations on "
        "the config file specification, all of the samples must use the same reference "
        "indices (i.e., genome sequence, set of ORFs, etc.).")

    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--tmp',
                        help="The temp directory",
                        default=default_tmp)

    parser.add_argument(
        '--flexbar-format-option',
        help="The name of the \"format\" "
        "option for flexbar. This changed from \"format\" to \"qtrim-format\" in "
        "version 2.7.",
        default=default_flexbar_format_option)

    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files "
                        "will be overwritten.",
                        action='store_true')

    parser.add_argument(
        '--merge-replicates',
        help="If this flag is present, then "
        "the ORF profiles from the replicates will be merged before making the final "
        "predictions",
        action='store_true')

    parser.add_argument(
        '--run-replicates',
        help="If this flag is given with the "
        "--merge-replicates flag, then both the replicates *and* the individual "
        "samples will be run. This flag has no effect if --merge-replicates is not "
        "given.",
        action='store_true')

    parser.add_argument(
        '-k',
        '--keep-intermediate-files',
        help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.",
        action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    # NOTE(review): yaml.load without an explicit Loader may construct
    # arbitrary objects, and recent PyYAML releases require the argument.
    # Consider yaml.safe_load if the config files only hold plain data.
    with open(args.config) as config_handle:
        config = yaml.load(config_handle)
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar', args.star_executable, 'samtools', 'bowtie2',
        'create-base-genome-profile', 'remove-multimapping-reads',
        'extract-metagene-profiles', 'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets', 'extract-orf-profiles',
        'estimate-orf-bayes-factors', 'select-final-prediction-set',
        'create-orf-profiles', 'predict-translated-orfs', 'run-rpbp-pipeline'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data', 'riboseq_samples', 'ribosomal_index', 'star_index',
        'genome_base_path', 'genome_name', 'fasta', 'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)

    # handle do_not_call so that we _do_ call the pipeline script, but that it does not run anything
    do_not_call_str = ""
    if not call:
        do_not_call_str = "--do-not-call"
    # the flag is forwarded on the command line instead, so clear it on args
    # before args is passed to slurm.check_sbatch
    args.do_not_call = False

    overwrite_str = ""
    if args.overwrite:
        overwrite_str = "--overwrite"

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    # if we merge the replicates, then we only use the rpbp script to create
    # the ORF profiles
    profiles_only_str = ""
    if args.merge_replicates and not args.run_replicates:
        profiles_only_str = "--profiles-only"

    # --run-replicates is meaningless on its own; the message now matches
    # the condition (it previously claimed the flag was given *with*
    # --merge-replicates)
    if args.run_replicates and not args.merge_replicates:
        msg = (
            "The --run-replicates option was given without the --merge-replicates "
            "option. It will be ignored.")
        logger.warning(msg)

    flexbar_format_option_str = ""
    if args.flexbar_format_option is not None:
        flexbar_format_option_str = "--flexbar-format-option {}".format(
            args.flexbar_format_option)

    # collect the job_ids in case we are using slurm and need to merge replicates
    job_ids = []

    sample_names = sorted(config['riboseq_samples'])

    for sample_name in sample_names:
        data = config['riboseq_samples'][sample_name]

        # each sample gets its own temp directory below the requested one
        tmp_str = ""
        if args.tmp is not None:
            tmp = os.path.join(args.tmp,
                               "{}_{}_rpbp".format(sample_name, note))
            tmp_str = "--tmp {}".format(tmp)

        cmd = "run-rpbp-pipeline {} {} {} --num-cpus {} {} {} {} {} {} {} {} {} {}".format(
            data, args.config, sample_name, args.num_cpus, tmp_str,
            do_not_call_str, overwrite_str, logging_str, star_str,
            profiles_only_str, flexbar_format_option_str,
            keep_intermediate_str, mem_str)

        job_id = slurm.check_sbatch(cmd, args=args)

        job_ids.append(job_id)

    # now, if we are running the "standard" pipeline, we are finished
    if not args.merge_replicates:
        return

    # otherwise, we need to merge the replicates for each condition
    riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
    merge_replicates_str = "--merge-replicates"

    for condition_name in sorted(riboseq_replicates):

        # then we predict the ORFs
        cmd = "predict-translated-orfs {} {} --num-cpus {} {} {} {} {}".format(
            args.config, condition_name, args.num_cpus, do_not_call_str,
            overwrite_str, logging_str, merge_replicates_str)

        # predictions depend on every per-sample job submitted above
        slurm.check_sbatch(cmd, args=args, dependencies=job_ids)