def main():
    """Create the latex preprocessing report for a riboseq project.

    The steps, in order:

    1. Parse the command line and load the yaml config file. A ``--note``
       given on the command line replaces the ``note`` from the config.
    2. Check that the external programs used for the figures exist.
    3. If ``args.use_slurm`` is set, pass the original command line to
       ``slurm.check_sbatch`` and return without doing anything else.
    4. Create the read filtering plots and the figures for each sample.
    5. Write ``preprocessing-report.tex`` into the output directory. It
       contains the read filtering figure, the read length distributions,
       the metagene profile (and optionally Bayes factor) plots for each
       read length and, optionally, the ORF type periodicity plots.
    6. Compile the tex file and, if requested, create the fastqc reports.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script creates a simple latex document containing the read "
        "filtering images, metagene profiles and analysis, and standard section text."
    )
    parser.add_argument('config',
                        help="The (yaml) config file for the project")
    parser.add_argument('out', help="The path for the output files")

    parser.add_argument(
        '--show-orf-periodicity',
        help="If this flag is "
        "present, bar charts showing the periodicity of each ORF type will be "
        "included in the report.",
        action='store_true')

    parser.add_argument(
        '--show-read-length-bfs',
        help="If this flag is given, "
        "plots showing the Bayes factor at each offset for each read length "
        "are included in the report.",
        action='store_true')

    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files will "
                        "be overwritten.",
                        action='store_true')

    parser.add_argument('--min-visualization-count',
                        help="Read lengths with fewer than this "
                        "number of reads will not be included in the report.",
                        type=int,
                        default=metagene_options['min_metagene_image_count'])

    parser.add_argument('--image-type',
                        help="The type of image types to create. This "
                        "must be an extension which matplotlib can interpret.",
                        default=default_image_type)

    parser.add_argument(
        '-c',
        '--create-fastqc-reports',
        help="If this flag is given, then "
        "fastqc reports will be created for most fastq and bam files. By default, they are "
        "not created.",
        action='store_true')

    parser.add_argument('--tmp',
                        help="If the fastqc reports are created, "
                        "they will use this location for temp files",
                        default=None)

    parser.add_argument(
        '--note',
        help="If this option is given, it will be used in the "
        "filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=None)

    slurm.add_sbatch_options(parser, num_cpus=default_num_cpus)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # Close the config file as soon as it has been parsed.
    # NOTE(review): FullLoader can construct some python objects; if the
    # config only ever holds plain yaml types, yaml.safe_load would do.
    with open(args.config) as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)

    # the command line note takes precedence over the one in the config
    if args.note is not None:
        config['note'] = args.note
    note = config.get('note', None)

    sample_names = sorted(config['riboseq_samples'].keys())

    # keep multimappers? Only the presence of the key matters, not its value.
    is_unique = not ('keep_riboseq_multimappers' in config)

    programs = [
        'create-read-length-metagene-profile-plot',
        'visualize-metagene-profile-bayes-factor',
        'get-all-read-filtering-counts', 'samtools',
        'visualize-read-filtering-counts', 'get-read-length-distribution',
        'plot-read-length-distribution'
    ]

    if args.create_fastqc_reports:
        programs.extend(['fastqc', 'java'])

    shell_utils.check_programs_exist(programs)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)

    # first, create the read filtering information...
    create_read_filtering_plots(args.config, config, args)

    # ... and all the other figures. The offsets of each sample are kept so
    # the periodicity section below does not have to read the csv again.
    offsets_by_sample = {}
    for name in sample_names:
        periodic_offsets = filenames.get_periodic_offsets(
            config['riboseq_data'], name, is_unique=is_unique, note=note)
        offsets_df = pd.read_csv(periodic_offsets)
        offsets_by_sample[name] = offsets_df
        create_figures(args.config, config, name, offsets_df, args)

    # the filtering thresholds: the config wins over the package defaults
    min_metagene_profile_count = config.get(
        'min_metagene_profile_count',
        metagene_options['min_metagene_profile_count'])

    min_bf_mean = config.get('min_metagene_bf_mean',
                             metagene_options['min_metagene_bf_mean'])

    max_bf_var = config.get('max_metagene_bf_var',
                            metagene_options['max_metagene_bf_var'])

    min_bf_likelihood = config.get(
        'min_metagene_bf_likelihood',
        metagene_options['min_metagene_bf_likelihood'])

    project_name = config.get("project_name", default_project_name)
    title = "Preprocessing results for {}".format(project_name)

    tex_file = os.path.join(args.out, "preprocessing-report.tex")
    with open(tex_file, 'w') as out:

        latex.begin_document(out, title, abstract, commands=commands)

        latex.section(out, "Introduction")

        latex.clearpage(out)
        latex.newpage(out)

        latex.section(out, "Mapping and filtering")
        latex.write(out, mapping_and_filtering_text)

        # the read filtering figures
        read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=note, image_type=args.image_type)

        no_rrna_note = "no-rrna-{}".format(note)
        no_rrna_read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'],
            note=no_rrna_note,
            image_type=args.image_type)

        latex.begin_figure(out)
        latex.write_graphics(out, read_filtering_image, width=0.45)
        latex.write_graphics(out, no_rrna_read_filtering_image, width=0.45)
        latex.write_caption(out,
                            read_filtering_caption,
                            label=read_filtering_label)
        latex.end_figure(out)

        latex.clearpage(out)

        # the read length distributions
        latex.section(out,
                      "Read length distributions",
                      label=length_distribution_section_label)

        msg = "Writing length distribution figures"
        logger.info(msg)

        latex.begin_table(out, "cc")

        latex.write_header(out,
                           ["All aligned reads", "Uniquely-aligning reads"])

        for name in sample_names:
            read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'],
                name,
                is_unique=False,
                note=note,
                image_type=args.image_type)

            unique_read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'],
                name,
                is_unique=True,
                note=note,
                image_type=args.image_type)

            # one table row per sample: the first cell is closed by a
            # column separator, the second one by a row separator
            cells = (
                (read_length_distribution_image, latex.write_column_sep),
                (unique_read_length_distribution_image, latex.write_row_sep),
            )

            for image, write_sep in cells:
                msg = "Looking for image file: {}".format(image)
                logger.debug(msg)

                if os.path.exists(image):
                    latex.write_graphics(out, image, width=0.45)
                else:
                    msg = "Could not find image: {}".format(image)
                    logger.warning(msg)

                    # keep the table layout intact with a placeholder
                    text = "Missing: {}\n\n".format(name)
                    latex.write(out, text)

                write_sep(out)

        latex.end_table(out)
        latex.clearpage(out)

        latex.section(out, "Read length periodicity", label=periodicity_label)

        for name in sample_names:
            # the number of images written so far to the table of this sample
            i = 0

            msg = "Processing sample: {}".format(name)
            logger.info(msg)

            logger.debug("overwrite: {}".format(args.overwrite))

            offsets_df = offsets_by_sample[name]

            min_read_length = int(offsets_df['length'].min())
            max_read_length = int(offsets_df['length'].max())

            latex.begin_table(out, "YY")

            header = "\\multicolumn{2}{c}{" + name + "}"
            header = [header]
            latex.write_header(out, header)

            for length in range(min_read_length, max_read_length + 1):
                msg = "Processing length: {}".format(length)
                logger.info(msg)

                # select the row for this length
                mask_length = offsets_df['length'] == length

                # TODO: this is sometimes length 0. why?
                if not mask_length.any():
                    continue

                length_row = offsets_df[mask_length].iloc[0]

                # now, check all of the filters; when several of them fail,
                # the status of the last failing one is reported
                offset = int(length_row['highest_peak_offset'])
                offset_status = "Used for analysis"

                if max_bf_var is not None:
                    if ((length_row['highest_peak_bf_mean'] <= min_bf_mean) or
                            length_row['highest_peak_bf_var'] >= max_bf_var):
                        offset_status = "BF mean too small or BF var too high"

                if min_bf_likelihood is not None:
                    # the upper tail above min_bf_mean of a normal
                    # distribution with the mean and variance of the peak
                    likelihood = 1 - scipy.stats.norm.cdf(
                        min_bf_mean, length_row['highest_peak_bf_mean'],
                        np.sqrt(length_row['highest_peak_bf_var']))
                    if likelihood <= min_bf_likelihood:
                        offset_status = "Likelihood too small"

                if (max_bf_var is None) and (min_bf_likelihood is None):
                    if length_row['highest_peak_bf_mean'] <= min_bf_mean:
                        offset_status = "BF mean too small"

                if length_row[
                        'highest_peak_profile_sum'] < min_metagene_profile_count:
                    offset_status = "Count too small"

                if length_row[
                        'highest_peak_profile_sum'] < args.min_visualization_count:
                    msg = "Not enough reads of this length. Skipping."
                    logger.warning(msg)
                    continue

                metagene_profile_image = filenames.get_metagene_profile_image(
                    config['riboseq_data'],
                    name,
                    image_type=args.image_type,
                    is_unique=is_unique,
                    length=length,
                    note=note)

                title = ("Length: {}. P-site offset: {}. Status: {}\n".format(
                    length, offset, offset_status))
                if args.show_read_length_bfs:
                    # the title spans both columns, i.e., the profile and
                    # the Bayes factor image of this read length
                    title = "\\scriptsize{" + title + "}"
                    title = "\\multicolumn{2}{c}{" + title + "}"
                    latex.write(out, title)
                    latex.write_row_sep(out)
                else:
                    latex.write(out, title, size="scriptsize")

                latex.write_graphics(out, metagene_profile_image, width=0.45)

                # an odd count means the left cell was just filled
                i += 1
                if i % 2 == 1:
                    latex.write_column_sep(out)
                else:
                    latex.write_row_sep(out)

                if args.show_read_length_bfs:

                    bayes_factor_image = filenames.get_metagene_profile_bayes_factor_image(
                        config['riboseq_data'],
                        name,
                        image_type=args.image_type,
                        is_unique=is_unique,
                        length=length,
                        note=note)

                    latex.write_graphics(out, bayes_factor_image, width=0.45)

                    i += 1
                    if i % 2 == 1:
                        latex.write_column_sep(out)
                    else:
                        latex.write_row_sep(out)

            # close a row which only has its left cell filled
            if i % 2 == 1:
                latex.write_row_sep(out)

            latex.end_table(out)
            latex.clearpage(out)

        ### ORF type metagene profiles
        if args.show_orf_periodicity:
            title = "ORF type periodicity"
            latex.section(out, title)

            strands = ['+', '-']
            for sample_name in sample_names:
                # the number of images written so far for this sample
                i = 0

                try:
                    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                        config,
                        sample_name,
                        is_unique=is_unique,
                        default_params=metagene_options)
                except FileNotFoundError:
                    msg = (
                        "Could not parse out lengths and offsets for sample: {}. "
                        "Skipping".format(sample_name))
                    logger.error(msg)
                    continue

                orf_type_profile_base = filenames.get_orf_type_profile_base(
                    config['riboseq_data'],
                    sample_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=note,
                    subfolder='orf-profiles')

                for orf_type in ribo_utils.orf_types:
                    for strand in strands:
                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base,
                            orf_type,
                            strand,
                            image_type=args.image_type)

                        msg = "Looking for image file: {}".format(
                            orf_type_profile)
                        logger.debug(msg)
                        if os.path.exists(orf_type_profile):
                            # each figure holds up to four images
                            if i % 4 == 0:
                                latex.begin_figure(out)

                            i += 1
                            latex.write_graphics(out,
                                                 orf_type_profile,
                                                 height=0.23)

                            if i % 4 == 0:
                                latex.end_figure(out)
                                latex.clearpage(out)

                # close the last figure if it is only partially filled
                if (i > 0) and (i % 4 != 0):
                    latex.end_figure(out)
                    latex.clearpage(out)

        latex.end_document(out)

    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)

    if args.create_fastqc_reports:
        parallel.apply_parallel_iter(config['riboseq_samples'].items(),
                                     args.num_cpus, create_fastqc_reports,
                                     config, args)
def main():
    """Collect the read length ORF profiles of a sample into one file.

    For the sample given on the command line (or, with ``--is-condition``,
    for every replicate of the given condition), the matrix market profile
    of each periodic read length is read. Profiles with the same read
    length are added together. Every non-zero entry is then written as one
    line to the gzipped output file:

        ``row col length value``

    or, with ``--add-ids``:

        ``row orf_id col length value``

    The function returns early, without writing any output, if some sample
    has no periodic read lengths.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Collect the individual read length ORF profiles (mtx) created "
        "by 'create-read-length-orf-profiles' into a single 'sparse tensor'. "
        "N.B. This script is called by 'create-read-length-orf-profiles', however "
        "we still call each sample independently for condition, lengths and offsets")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name', help="The name of either one of the 'riboseq_samples' "
        "or 'riboseq_biological_replicates' from the config file.")

    parser.add_argument('out', help="The output (txt.gz) file. N.B. The output uses "
        "base-0 indexing, contrary to the unsmoothed ORF profiles, which are written "
        "using the matrix market format (base-1 indexing).")

    parser.add_argument('-c', '--is-condition', help="If this flag is present, "
        "then 'name' will be taken to be a condition name. The profiles for "
        "all relevant replicates of the condition will be created.",
        action='store_true')

    parser.add_argument('--add-ids', help="If this flag is present, "
        "then orf_ids will be added to the final output.", action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading config file"
    logger.info(msg)

    # Close the config file as soon as it has been parsed.
    # NOTE(review): FullLoader can construct some python objects; if the
    # config only ever holds plain yaml types, yaml.safe_load would do.
    with open(args.config) as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)

    # pull out what we need from the config file
    # (only the presence of the multimapper key matters, not its value)
    is_unique = not ('keep_riboseq_multimappers' in config)
    note = config.get('note', None)

    # a map from orf_num to the ORF id; it stays None unless ids are wanted
    orf_ids = None
    if args.add_ids:
        orf_note = config.get('orf_note', None)
        orfs_file = filenames.get_orfs(
            config['genome_base_path'],
            config['genome_name'],
            note=orf_note
        )
        orfs = bed_utils.read_bed(orfs_file)

        # Build the lookup once rather than scanning all of the ORFs for
        # every non-zero entry. If an orf_num occurs several times, its
        # first id is used.
        orf_ids = (
            orfs.drop_duplicates(subset='orf_num')
            .set_index('orf_num')['id']
            .to_dict()
        )

    names = [args.name]
    if args.is_condition:
        riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
        names = list(riboseq_replicates[args.name])

    # keep a map from the lengths to the combined profiles
    length_profile_map = {}

    for name in names:

        msg = "Processing sample: {}".format(name)
        logger.info(msg)

        # get the lengths and offsets which meet the required criteria from the config file
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config,
            name,
            is_unique=is_unique,
            default_params=metagene_options
        )

        if len(lengths) == 0:
            msg = ("No periodic read lengths and offsets were found. Try relaxing "
                "min_metagene_profile_count, min_metagene_bf_mean, "
                "max_metagene_bf_var, and/or min_metagene_bf_likelihood. Quitting.")
            logger.critical(msg)
            return

        for length, offset in zip(lengths, offsets):

            mtx_file = filenames.get_riboseq_profiles(
                config['riboseq_data'],
                name,
                length=[length],
                offset=[offset],
                is_unique=is_unique,
                note=note
            )

            mtx = scipy.io.mmread(mtx_file).tocsr()

            # add up the profiles of all samples for this read length
            prior_mtx = length_profile_map.get(length, None)

            if prior_mtx is None:
                length_profile_map[length] = mtx
            else:
                length_profile_map[length] = prior_mtx + mtx

    with gzip.open(args.out, 'wb') as target_gz:

        for length, mtx in length_profile_map.items():
            # the coo format gives direct access to the (row, col, value)
            # triplets, which are zero-based
            mtx = mtx.tocoo()

            msg = "Writing ORF profiles. length: {}.".format(length)
            logger.info(msg)

            for row, col, val in zip(mtx.row, mtx.col, mtx.data):
                if orf_ids is None:
                    s = "{} {} {} {}\n".format(row, col, length, val)
                else:
                    # a row without a matching orf_num raises a KeyError
                    s = "{} {} {} {} {}\n".format(
                        row, orf_ids[row], col, length, val)
                target_gz.write(s.encode())
def create_figures(config_file, config, name, offsets_df, args):
    """ Create the figures of one sample for the preprocessing report.

        The figures are the read length distributions (all and uniquely
        aligned reads), the metagene profile of each read length with
        enough reads and, depending on the command line flags, the Bayes
        factor plots and the ORF type metagene profiles. Each figure is
        created by an external program, which is started through
        shell_utils.call_if_not_exists.

        The config_file argument is not used by this function.
    """

    def run(command, inputs, outputs):
        # every command but the last one of this function is started like this
        shell_utils.call_if_not_exists(command,
                                       outputs,
                                       in_files=inputs,
                                       overwrite=args.overwrite,
                                       call=True)

    logging_str = logging_utils.get_logging_options_string(args)

    note = config.get('note', None)
    note_str = filenames.get_note_string(note)

    # multimappers are kept as soon as the key is present in the config
    is_unique = 'keep_riboseq_multimappers' not in config

    image_type_str = "--image-type {}".format(args.image_type)

    read_lengths = offsets_df['length']
    min_read_length = int(read_lengths.min())
    max_read_length = int(read_lengths.max())

    min_read_length_str = "--min-read-length {}".format(min_read_length)
    max_read_length_str = "--max-read-length {}".format(max_read_length)

    logger.info(
        "{}: Getting and visualizing read length distribution".format(name))

    riboseq_data = config['riboseq_data']

    # the bam files with all and with the uniquely aligned reads ...
    genome_bam = filenames.get_riboseq_bam(riboseq_data, name, note=note)
    unique_filename = filenames.get_riboseq_bam(riboseq_data,
                                                name,
                                                is_unique=is_unique,
                                                note=note)

    # ... and the file for the read length counts
    read_length_distribution = filenames.get_riboseq_read_length_distribution(
        riboseq_data, name, note=note)

    distribution_cmd = "get-read-length-distribution {} {} --out {} {}".format(
        genome_bam, unique_filename, read_length_distribution, logging_str)
    run(distribution_cmd, [genome_bam, unique_filename],
        [read_length_distribution])

    # the title of the plots: the mapped sample name if there is one,
    # otherwise the sample name followed by the note
    if 'riboseq_sample_name_map' in config:
        title = config['riboseq_sample_name_map'].get(name)
    else:
        title = None

    if title is None:
        title = "{}{}".format(name, note_str)

    # one read length distribution plot for all aligned reads and one for
    # the uniquely aligned reads:
    # (title template, flag for the sample name, flag for the image name)
    distribution_plots = (
        ("{}, All aligned reads", False, False),
        ("{}, Uniquely aligned reads", True, is_unique),
    )

    for title_template, sample_is_unique, image_is_unique in distribution_plots:
        plot_title = title_template.format(title)
        title_str = "--title={}".format(shlex.quote(plot_title))

        # get the basename for the distribution file
        unique_str = filenames.get_unique_string(sample_is_unique)
        sample_name = "{}{}{}".format(name, note_str, unique_str)

        distribution_image = filenames.get_riboseq_read_length_distribution_image(
            riboseq_data,
            name,
            is_unique=image_is_unique,
            note=note,
            image_type=args.image_type)

        plot_cmd = "plot-read-length-distribution {} {} {} {} {} {}".format(
            read_length_distribution, sample_name, distribution_image,
            title_str, min_read_length_str, max_read_length_str)
        run(plot_cmd, [read_length_distribution], [distribution_image])

    # visualize the metagene profiles
    logger.info(
        "{}: Visualizing metagene profiles and Bayes' factors".format(name))

    metagene_profiles = filenames.get_metagene_profiles(riboseq_data,
                                                        name,
                                                        is_unique=is_unique,
                                                        note=note)

    profile_bayes_factor = filenames.get_metagene_profiles_bayes_factors(
        riboseq_data, name, is_unique=is_unique, note=note)

    # The parsed profiles are not used below. The read is kept as it is,
    # so a missing or malformed profiles file still fails at this point.
    pd.read_csv(metagene_profiles)

    for length in range(min_read_length, max_read_length + 1):

        # make sure we had some reads of that length
        rows_for_length = offsets_df[offsets_df['length'] == length]
        if len(rows_for_length) == 0:
            continue

        # make sure we have enough reads to visualize
        length_row = rows_for_length.iloc[0]
        profile_sum = length_row['highest_peak_profile_sum']
        if profile_sum < args.min_visualization_count:
            continue

        # visualize the metagene profile
        profile_image = filenames.get_metagene_profile_image(
            riboseq_data,
            name,
            image_type=args.image_type,
            is_unique=is_unique,
            length=length,
            note=note)

        title_str = "--title {}".format(
            shlex.quote("{}. length: {}".format(title, length)))
        profile_cmd = (
            "create-read-length-metagene-profile-plot {} {} {} {}".format(
                metagene_profiles, length, profile_image, title_str))
        run(profile_cmd, [metagene_profiles], [profile_image])

        if not args.show_read_length_bfs:
            continue

        # and the Bayes' factor
        bayes_factor_image = filenames.get_metagene_profile_bayes_factor_image(
            riboseq_data,
            name,
            image_type=args.image_type,
            is_unique=is_unique,
            length=length,
            note=note)

        title_str = "--title {}".format(
            shlex.quote(
                "Metagene profile Bayes' factors: {}. length: {}".format(
                    title, length)))
        fontsize_str = "--font-size 15"

        bayes_factor_cmd = (
            "visualize-metagene-profile-bayes-factor {} {} {} {} {}".format(
                profile_bayes_factor, length, bayes_factor_image, title_str,
                fontsize_str))
        run(bayes_factor_cmd, [profile_bayes_factor], [bayes_factor_image])

    # the orf-type metagene profiles are only created on request
    if not args.show_orf_periodicity:
        return

    logger.info("{}: Visualizing the ORF type metagene profiles".format(title))

    try:
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config,
            name,
            is_unique=is_unique,
            default_params=metagene_options)
    except FileNotFoundError:
        logger.error(("Could not parse out lengths and offsets for sample: {}. "
                      "Skipping".format(name)))
        return

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    profiles = filenames.get_riboseq_profiles(riboseq_data,
                                              name,
                                              length=lengths,
                                              offset=offsets,
                                              is_unique=is_unique,
                                              note=note)

    title_str = "--title {}".format(
        shlex.quote("{}, ORF-type periodicity".format(title)))

    orf_type_profile_base = filenames.get_orf_type_profile_base(
        riboseq_data,
        name,
        length=lengths,
        offset=offsets,
        is_unique=is_unique,
        note=note,
        subfolder='orf-profiles')

    # the expected images: first all of the forward strand ones, then all
    # of the reverse strand ones
    orf_type_images = [
        filenames.get_orf_type_profile_image(orf_type_profile_base, orf_type,
                                             strand, args.image_type)
        for strand in ("+", "-")
        for orf_type in ribo_utils.orf_types
    ]

    orf_type_cmd = (
        "visualize-orf-type-metagene-profiles {} {} {} {} {} {}".format(
            orfs_genomic, profiles, orf_type_profile_base, title_str,
            image_type_str, logging_str))

    # unlike the commands above, this one does not pass the "call" argument
    shell_utils.call_if_not_exists(orf_type_cmd,
                                   orf_type_images,
                                   in_files=[orfs_genomic, profiles],
                                   overwrite=args.overwrite)
def get_counts(name_data, config, args):
    """Count the reads remaining for one sample after each processing step.

    Parameters
    ----------
    name_data : tuple
        A ``(name, data)`` pair; ``data`` is passed to the fastq read counter
        as the raw data file for the sample called ``name``.
    config : dict-like
        The project configuration. ``config['riboseq_data']`` is required;
        the ``note`` and ``keep_riboseq_multimappers`` keys are optional.
    args
        Not used by this function.

    Returns
    -------
    pd.Series
        The entries ``note`` (which holds the sample name),
        ``raw_data_count``, ``without_adapters_count``,
        ``without_rrna_count``, ``genome_count``, ``unique_count`` and
        ``length_count``.
    """
    name, data = name_data
    logger.info("processing {}...".format(name))

    note = config.get('note', None)

    # multimappers are kept only if the config mentions them at all
    is_unique = 'keep_riboseq_multimappers' not in config

    def _count_fastq(message, path):
        # log the (already worded) message for this sample, then count
        logger.info(message.format(name))
        return fastx_utils.get_read_count(path, is_fasta=False)

    def _count_bam(message, path):
        logger.info(message.format(name))
        return bam_utils.count_aligned_reads(path)

    # resolve the path of every file up front
    riboseq_base = config['riboseq_data']
    raw_data = data
    without_adapters = ribo_filenames.get_without_adapters_fastq(
        riboseq_base, name, note=note)
    with_rrna = ribo_filenames.get_with_rrna_fastq(
        riboseq_base, name, note=note)
    without_rrna = ribo_filenames.get_without_rrna_fastq(
        riboseq_base, name, note=note)
    genome_bam = ribo_filenames.get_riboseq_bam(
        riboseq_base, name, note=note)
    unique_bam = ribo_filenames.get_riboseq_bam(
        riboseq_base, name, is_unique=is_unique, note=note)

    logger.info("{}: collecting read counts".format(name))

    raw_data_count = _count_fastq(
        "{}: counting reads in raw data", raw_data)
    without_adapters_count = _count_fastq(
        "{}: counting reads without adapters", without_adapters)
    # NOTE(review): this count is computed but is not part of the returned
    # Series -- confirm whether it should be reported or dropped.
    with_rrna_count = _count_fastq(
        "{}: counting reads with rrna", with_rrna)
    without_rrna_count = _count_fastq(
        "{}: counting reads without rrna", without_rrna)

    genome_count = _count_bam(
        "{}: counting genome-aligned reads", genome_bam)
    unique_count = _count_bam(
        "{}: counting uniquely-aligned reads", unique_bam)

    logger.info("{}: counting reads with selected lengths".format(name))

    # sum the reads in the unique bam file whose length is periodic
    try:
        periodic_lengths, _offsets = (
            ribo_utils.get_periodic_lengths_and_offsets(
                config, name, is_unique=is_unique))
        lengths_str = ','.join(periodic_lengths)
        length_distribution = bam_utils.get_length_distribution(unique_bam)
        selected_lengths = {int(length) for length in periodic_lengths}
        m_selected = length_distribution['length'].isin(selected_lengths)
        length_count = np.sum(length_distribution.loc[m_selected, 'count'])

        logger.debug(
            "{}: found the following periodic lengths: {}. The number of reads "
            "of these lengths: {}".format(name, lengths_str, length_count))
    except ValueError as err:
        logger.warning(
            "Encountered a problem counting periodic reads. This probably "
            "means no read lengths were periodic. Error message: {}".format(
                err))
        length_count = 0

    return pd.Series({
        'note': name,
        'raw_data_count': raw_data_count,
        'without_adapters_count': without_adapters_count,
        'without_rrna_count': without_rrna_count,
        'genome_count': genome_count,
        'unique_count': unique_count,
        'length_count': length_count
    })
Example #5
0
def main():
    """Extract the differential micropeptides between two conditions.

    Command-line entry point. It parses the arguments, locates the Bayes
    factor and filtered prediction files of conditions A and B, and builds a
    table of micropeptides which either appear in only one condition or
    overlap an ORF of the other condition. The table is then annotated with
    the requested Bayes factor fields, transcript/gene information from
    pyensembl and mygene, and the optional --id-matches and --overlaps
    columns. Finally it is sorted by in-frame read count, optionally
    filtered, and written to ``out``.

    Raises
    ------
    ValueError
        If the number of --id-matches (or --overlaps) files differs from the
        number of names given for them.
    FileNotFoundError
        If a Bayes factor file, predictions file or the exons file cannot be
        found.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script extracts the differential micropeptides from two "
        "conditions. Please see the documentation in redmine for more details.\n\n"
        "Please see the pyensembl (https://github.com/hammerlab/pyensembl) "
        "documentation for more information about the ensembl release and species."
    )

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name_a', help="The name of the first condition")
    parser.add_argument('name_b', help="The name of the second condition")
    parser.add_argument('out', help="The output (.csv.gz or .xlsx) file")

    parser.add_argument(
        '-a',
        '--append-sheet',
        help="If this flag is given, "
        "then a worksheet with the name '<name_a>,<name_b>' will be appended "
        "to the .xlsx file given by out (if it exists)",
        action='store_true')

    parser.add_argument(
        '-f',
        '--filter',
        help="If this flag is present, then "
        "the output will be filtered to include only the differential "
        "micropeptides with the highest KL-divergence and read coverage",
        action='store_true')

    parser.add_argument(
        '--read-filter-percent',
        help="If the the --filter flag "
        "is given, then only the top --read-filter-percent micropeptides will "
        "be considered for the final output. They still must meet the KL-"
        "divergence filtering criteria.",
        type=float,
        default=default_read_filter_percent)

    parser.add_argument(
        '--kl-filter-percent',
        help="If the the --filter flag "
        "is given, then only the top --kl-filter-percent micropeptides will "
        "be considered for the final output. They still must meet the read "
        "coverage filtering criteria.",
        type=float,
        default=default_kl_filter_percent)

    parser.add_argument(
        '--id-matches',
        help="This is a list of files which "
        "contain ORF identifiers to compare to the differential micropeptides. "
        "For each of the files given, two columns will be added to the output "
        "which indicate if either A or B appear in the respective file. Each "
        "file should have a single ORF identifier on each line and contain "
        "nothing else.",
        nargs='*',
        default=default_id_matches)

    parser.add_argument(
        '--id-match-names',
        help="A name to include in the "
        "output file for each --id-matches file. The number of names must "
        "match the number of files.",
        nargs='*',
        default=default_id_match_names)

    parser.add_argument(
        '--overlaps',
        help="This is a list of bed12+ files "
        "which will be compared to the differential micropeptides. Two columns "
        "(one for A, one for B) will be added to the output which indicate if "
        "the respective micropeptides overlap a feature in each file by at "
        "least 1 bp.",
        nargs='*',
        default=default_overlaps)

    parser.add_argument(
        '--overlap-names',
        help="A name to include in the "
        "output file for each --overlaps file. The number of names must match "
        "the number of files.",
        nargs='*',
        default=default_overlap_names)

    parser.add_argument(
        '-r',
        '--ensembl-release',
        help="The version of Ensembl "
        "to use when mapping transcript identifiers to gene identifiers",
        type=int,
        default=default_ensembl_release)

    parser.add_argument(
        '-s',
        '--ensembl-species',
        help="The Ensembl species "
        "to use when mapping transcript identifiers to gene identifiers",
        default=default_ensembl_species)

    parser.add_argument(
        '--a-is-single-sample',
        help="By default, this script "
        "assumes the predictions come from merged replicates. If name_a is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.",
        action='store_true')

    parser.add_argument(
        '--b-is-single-sample',
        help="By default, this script "
        "assumes the predictions come from merged replicates. If name_b is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.",
        action='store_true')

    parser.add_argument('--fields-to-keep',
                        help="The fields to keep from the "
                        "Bayes factor file for each condition",
                        nargs='*',
                        default=default_fields_to_keep)

    parser.add_argument('--max-micropeptide-len',
                        help="The maximum (inclusive) "
                        "length of ORFs considered as micropeptides",
                        type=int,
                        default=default_max_micropeptide_len)

    parser.add_argument(
        '--do-not-fix-tcons',
        help="By default, the \"TCONS_\" "
        "identifiers from StringTie, etc., do not parse correctly; this script "
        "update the identifiers so that will parse correctly unless instructed not "
        "to. The script is likely to crash if the identifiers are not fixed.",
        action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ensembl database"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release,
                                       species=args.ensembl_species)
    # NOTE(review): the attribute is accessed only for its side effect;
    # presumably this makes pyensembl open its database so that a missing
    # installation fails before any real work is done -- confirm.
    ensembl.db

    msg = "Checking the id-match and overlaps files"
    logger.info(msg)

    # every extra file needs exactly one name for its output columns
    if len(args.id_matches) != len(args.id_match_names):
        msg = ("The number of --id-matches files and --id-match-names do not "
               "match. {} files and {} names".format(len(args.id_matches),
                                                     len(args.id_match_names)))

        raise ValueError(msg)

    if len(args.overlaps) != len(args.overlap_names):
        msg = ("The number of --overlaps files and --overlap-names do not "
               "match. {} files and {} names".format(len(args.overlaps),
                                                     len(args.overlap_names)))

        raise ValueError(msg)

    utils.check_files_exist(args.id_matches)
    utils.check_files_exist(args.overlaps)

    if args.filter:
        msg = "Validating filter percentages"
        logger.info(msg)

        math_utils.check_range(args.read_filter_percent,
                               0,
                               1,
                               variable_name="--read-filter-percent")

        math_utils.check_range(args.kl_filter_percent,
                               0,
                               1,
                               variable_name="--kl-filter-percent")

    msg = "Extracting file names"
    logger.info(msg)

    # close the config file as soon as it has been parsed
    with open(args.config) as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)

    note_str = config.get('note', None)

    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    # and the smoothing parameters
    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations',
                                        None)

    # the periodic lengths and offsets are only looked up for single samples;
    # otherwise None is passed to the filename helpers
    lengths_a = None
    offsets_a = None

    if args.a_is_single_sample:
        lengths_a, offsets_a = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name_a, is_unique=is_unique)

    bayes_factors_a = filenames.get_riboseq_bayes_factors(
        config['riboseq_data'],
        args.name_a,
        length=lengths_a,
        offset=offsets_a,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors_a):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".
               format(args.name_a, bayes_factors_a))
        raise FileNotFoundError(msg)

    predicted_orfs_a = filenames.get_riboseq_predicted_orfs(
        config['riboseq_data'],
        args.name_a,
        length=lengths_a,
        offset=offsets_a,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations,
        is_filtered=True,
        is_chisq=False)

    if not os.path.exists(predicted_orfs_a):
        msg = (
            "Could not find the predictions bed file for {}. ({}). Quitting.".
            format(args.name_a, predicted_orfs_a))
        raise FileNotFoundError(msg)

    lengths_b = None
    offsets_b = None
    if args.b_is_single_sample:
        lengths_b, offsets_b = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name_b, is_unique=is_unique)

    bayes_factors_b = filenames.get_riboseq_bayes_factors(
        config['riboseq_data'],
        args.name_b,
        length=lengths_b,
        offset=offsets_b,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors_b):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".
               format(args.name_b, bayes_factors_b))
        raise FileNotFoundError(msg)

    predicted_orfs_b = filenames.get_riboseq_predicted_orfs(
        config['riboseq_data'],
        args.name_b,
        length=lengths_b,
        offset=offsets_b,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations,
        is_filtered=True,
        is_chisq=False)

    if not os.path.exists(predicted_orfs_b):
        msg = (
            "Could not find the predictions bed file for {}. ({}). Quitting.".
            format(args.name_b, predicted_orfs_b))
        raise FileNotFoundError(msg)

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    if not os.path.exists(exons_file):
        msg = "Could not find the exons file ({}). Quitting.".format(
            exons_file)
        raise FileNotFoundError(msg)

    msg = "Reading the exons"
    logger.info(msg)

    exons = bed_utils.read_bed(exons_file)

    msg = "Reading the BF files"
    logger.info(msg)

    bf_df_a = bed_utils.read_bed(bayes_factors_a)
    bf_df_b = bed_utils.read_bed(bayes_factors_b)

    msg = "Reading the predictions files"
    logger.info(msg)

    bed_df_a = bed_utils.read_bed(predicted_orfs_a)
    bed_df_b = bed_utils.read_bed(predicted_orfs_b)

    # one data frame per kind of difference; concatenated below
    differential_micropeptide_dfs = []

    # extract micropeptides
    msg = "Extracting micropeptides"
    logger.info(msg)

    # split the predictions of each condition by ORF length
    m_micropeptides_a = bed_df_a['orf_len'] <= args.max_micropeptide_len
    m_micropeptides_b = bed_df_b['orf_len'] <= args.max_micropeptide_len

    micropeptides_a = bed_df_a[m_micropeptides_a]
    micropeptides_b = bed_df_b[m_micropeptides_b]

    long_orfs_a = bed_df_a[~m_micropeptides_a]
    long_orfs_b = bed_df_b[~m_micropeptides_b]

    msg = "Finding micropeptides in A with no overlap in B"
    logger.info(msg)

    micropeptides_a_no_match_b = bed_utils.subtract_bed(micropeptides_a,
                                                        bed_df_b,
                                                        exons=exons)

    # there is no partner in B, so the KL-divergence is set to infinity
    micropeptides_a_no_match_b_df = pd.DataFrame()
    micropeptides_a_no_match_b_df['A'] = list(micropeptides_a_no_match_b)
    micropeptides_a_no_match_b_df['B'] = None
    micropeptides_a_no_match_b_df['kl'] = np.inf
    micropeptides_a_no_match_b_df['overlap_type'] = 'micro_a_only'

    differential_micropeptide_dfs.append(micropeptides_a_no_match_b_df)

    msg = "Finding micropeptides in B with no overlap in A"
    logger.info(msg)

    micropeptides_b_no_match_a = bed_utils.subtract_bed(micropeptides_b,
                                                        bed_df_a,
                                                        exons=exons)

    micropeptides_b_no_match_a_df = pd.DataFrame()
    micropeptides_b_no_match_a_df['B'] = list(micropeptides_b_no_match_a)
    micropeptides_b_no_match_a_df['A'] = None
    micropeptides_b_no_match_a_df['kl'] = np.inf
    micropeptides_b_no_match_a_df['overlap_type'] = 'micro_b_only'

    differential_micropeptide_dfs.append(micropeptides_b_no_match_a_df)

    msg = "Finding overlapping micropeptides"
    logger.info(msg)

    micropeptides_a_micropeptides_b_df = get_overlap_df(
        micropeptides_a, micropeptides_b, 'micro_a_micro_b', bf_df_a, bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_micropeptides_b_df)

    micropeptides_a_long_b_df = get_overlap_df(micropeptides_a, long_orfs_b,
                                               'micro_a_long_b', bf_df_a,
                                               bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_long_b_df)

    micropeptides_b_long_a_df = get_overlap_df(long_orfs_a, micropeptides_b,
                                               'long_a_micro_b', bf_df_a,
                                               bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_b_long_a_df)

    differential_micropeptides_df = pd.concat(differential_micropeptide_dfs)

    msg = "Adding read count information"
    logger.info(msg)

    # attach the requested Bayes factor fields of each condition; the merged
    # columns get an _A/_B suffix and the redundant id column is dropped
    res = differential_micropeptides_df.merge(bf_df_a[args.fields_to_keep],
                                              left_on='A',
                                              right_on='id',
                                              how='left')
    to_rename = {f: "{}_A".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_A', axis=1)

    res = res.merge(bf_df_b[args.fields_to_keep],
                    left_on='B',
                    right_on='id',
                    how='left')
    to_rename = {f: "{}_B".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_B', axis=1)

    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    if not args.do_not_fix_tcons:
        # replace TCONS_ with TCONS
        res['A'] = res['A'].str.replace("TCONS_", "TCONS")
        res['B'] = res['B'].str.replace("TCONS_", "TCONS")

    msg = "Extracting the genes and their biotypes using pyensembl"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release,
                                       species=args.ensembl_species)
    ensembl_transcript_ids = set(ensembl.transcript_ids())

    biotypes_a = parallel.apply_df_simple(res, get_transcript_and_biotype, 'A',
                                          ensembl, ensembl_transcript_ids)
    biotypes_b = parallel.apply_df_simple(res, get_transcript_and_biotype, 'B',
                                          ensembl, ensembl_transcript_ids)

    biotypes_a = utils.remove_nones(biotypes_a)
    biotypes_b = utils.remove_nones(biotypes_b)

    biotypes_a = pd.DataFrame(biotypes_a)
    biotypes_b = pd.DataFrame(biotypes_b)

    res = res.merge(biotypes_a, on='A', how='left')
    res = res.merge(biotypes_b, on='B', how='left')

    msg = "Pulling annotations from mygene.info"
    logger.info(msg)

    # pull annotations from mygene
    gene_info_a = mygene_utils.query_mygene(res['gene_id_A'])
    gene_info_b = mygene_utils.query_mygene(res['gene_id_B'])

    # and add the mygene info
    res = res.merge(gene_info_a,
                    left_on='gene_id_A',
                    right_on='gene_id',
                    how='left')

    # suffix every mygene column except the join key, which is dropped
    to_rename = {f: "{}_A".format(f) for f in gene_info_a.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)

    res = res.merge(gene_info_b,
                    left_on='gene_id_B',
                    right_on='gene_id',
                    how='left')

    # the rename map must be built from the columns of the frame that was
    # just merged (B), not from the columns of A
    to_rename = {f: "{}_B".format(f) for f in gene_info_b.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)

    msg = "Removing duplicates"
    logger.info(msg)
    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    msg = "Adding --id-matches columns"
    logger.info(msg)

    for (id_match_file, name) in zip(args.id_matches, args.id_match_names):
        res = add_id_matches(res, id_match_file, name)

    msg = "Adding --overlaps columns"
    logger.info(msg)

    for (overlap_file, name) in zip(args.overlaps, args.overlap_names):
        res = add_overlaps(res, overlap_file, name, bed_df_a, bed_df_b, exons)

    msg = "Sorting by in-frame reads"
    logger.info(msg)

    # a missing count (no partner in that condition) contributes 0 reads
    res['x_1_sum_A'] = res['x_1_sum_A'].fillna(0)
    res['x_1_sum_B'] = res['x_1_sum_B'].fillna(0)
    res['x_1_sum'] = res['x_1_sum_A'] + res['x_1_sum_B']
    res = res.sort_values('x_1_sum', ascending=False)

    if args.filter:
        msg = "Filtering the micropeptides by read coverage and KL-divergence"
        logger.info(msg)

        # rank 1 is the highest read count; keep the ranks which fall within
        # the requested fraction of the largest rank
        x_1_sum_ranks = res['x_1_sum'].rank(method='min',
                                            na_option='top',
                                            ascending=False)
        num_x_1_sum_ranks = x_1_sum_ranks.max()
        max_good_x_1_sum_rank = num_x_1_sum_ranks * args.read_filter_percent
        m_good_x_1_sum_rank = x_1_sum_ranks <= max_good_x_1_sum_rank

        msg = ("Number of micropeptides passing read filter: {}".format(
            sum(m_good_x_1_sum_rank)))
        logger.debug(msg)

        # same idea for the KL-divergence, but with dense ranks
        kl_ranks = res['kl'].rank(method='dense',
                                  na_option='top',
                                  ascending=False)
        num_kl_ranks = kl_ranks.max()
        max_good_kl_rank = num_kl_ranks * args.kl_filter_percent
        m_good_kl_rank = kl_ranks <= max_good_kl_rank

        msg = ("Number of micropeptides passing KL filter: {}".format(
            sum(m_good_kl_rank)))
        logger.debug(msg)

        m_both_filters = m_good_x_1_sum_rank & m_good_kl_rank

        msg = ("Number of micropeptides passing both filters: {}".format(
            sum(m_both_filters)))
        logger.debug(msg)

        res = res[m_both_filters]

    msg = "Writing differential micropeptides to disk"
    logger.info(msg)

    # --append-sheet is a store_true flag, so it is always True or False and
    # never None; its truth value decides which writer is used
    if args.append_sheet:
        sheet_name = "{},{}".format(args.name_a, args.name_b)
        utils.append_to_xlsx(res, args.out, sheet=sheet_name, index=False)
    else:
        pandas_utils.write_df(res, args.out, index=False)
Example #6
0
def main():
    
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description="""This script runs all of the processing necessary to 
        produce the signals used for ORF translation prediction. In particular, it creates the 
        metagene profiles, selected the periodic fragments and generate the ORF profiles.""")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")

    parser.add_argument('config', help="The (yaml) configuration file")

    parser.add_argument('name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
                        type=int, default=default_num_cpus)

    parser.add_argument('--mem', help="The amount of RAM to request", default=default_mem)

    parser.add_argument('--tmp', help="The location for temp files", default=None)

    parser.add_argument('--do-not-call', action='store_true')

    parser.add_argument('--overwrite', help="""If this flag is present, existing files 
        will be overwritten.""", action='store_true')
         
    parser.add_argument('-k', '--keep-intermediate-files', help="""If this flag is given,
        then all intermediate files will be kept; otherwise, they will be deleted. 
        This feature is implemented piecemeal. If the --do-not-call flag is given, 
        then nothing will be deleted.""", action='store_true')

    logging_utils.add_logging_options(parser)
    pgrm_utils.add_star_options(parser, star_executable)
    pgrm_utils.add_flexbar_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[create-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'create-base-genome-profile',
        'remove-multimapping-reads',
        'extract-metagene-profiles',
        'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets',
        'extract-orf-profiles'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'ribosomal_index',
        'gtf',
        'genome_base_path',
        'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)
    models_base = config.get('models_base', default_models_base)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = pgrm_utils.get_star_options_string(args)
    flexbar_str = pgrm_utils.get_flexbar_options_string(args)

    # handle do_not_call so that we do call the preprocessing script,
    # but that it does not run anything
    call = not args.do_not_call
    do_not_call_argument = ""
    if not call:
        do_not_call_argument = "--do-not-call"

    overwrite_argument = ""
    if args.overwrite:
        overwrite_argument = "--overwrite"

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    # check if we want to keep multimappers
    is_unique = not ('keep_riboseq_multimappers' in config)

    riboseq_raw_data = args.raw_data
    riboseq_bam_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                     args.name,
                                                     is_unique=is_unique,
                                                     note=note)

    cmd = ("create-base-genome-profile {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}".format(
        riboseq_raw_data,
        args.config,
        args.name,
        args.num_cpus,
        do_not_call_argument,
        overwrite_argument,
        logging_str,
        star_str,
        tmp_str,
        flexbar_str,
        keep_intermediate_str,
        mem_str))

    # There could be cases where we start somewhere in the middle of creating
    # the base genome profile. So even if the "raw data" is not available, 
    # we still want to call the base pipeline.
    # in_files = [riboseq_raw_data]
    in_files = []
    out_files = [riboseq_bam_filename]
    # we always call this, and pass --do-not-call through
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=True)

    # Extract the metagene profiles

    start_upstream_str = utils.get_config_argument(config,
                                                   'metagene_start_upstream',
                                                   'start-upstream',
                                                   default=metagene_options['metagene_start_upstream'])
    start_downstream_str = utils.get_config_argument(config,
                                                     'metagene_start_downstream',
                                                     'start-downstream',
                                                     default=metagene_options['metagene_start_downstream'])
    end_upstream_str = utils.get_config_argument(config,
                                                 'metagene_end_upstream',
                                                 'end-upstream',
                                                 default=metagene_options['metagene_end_upstream'])
    end_downstream_str = utils.get_config_argument(config,
                                                   'metagene_end_downstream',
                                                   'end-downstream',
                                                   default=metagene_options['metagene_end_downstream'])

    metagene_profiles = filenames.get_metagene_profiles(config['riboseq_data'],
                                                        args.name,
                                                        is_unique=is_unique,
                                                        note=note)

    # use the canonical transcripts for extracting the metagene profiles
    transcript_bed = filenames.get_bed(config['genome_base_path'],
                                       config['genome_name'],
                                       is_merged=False,
                                       is_annotated=True)

    cmd = ("extract-metagene-profiles {} {} {} --num-cpus {} {} {} {} {} {}".format(
        riboseq_bam_filename,
        transcript_bed,
        metagene_profiles,
        args.num_cpus,
        logging_str,
        start_upstream_str,
        start_downstream_str,
        end_upstream_str,
        end_downstream_str))

    in_files = [riboseq_bam_filename, transcript_bed]
    out_files = [metagene_profiles]
    file_checkers = {
        metagene_profiles: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # estimate the periodicity for each offset for all read lengths
    metagene_profile_bayes_factors = filenames.get_metagene_profiles_bayes_factors(
        config['riboseq_data'],
        args.name,
        is_unique=is_unique,
        note=note)

    periodic_models = filenames.get_models(models_base, 'periodic')
    non_periodic_models = filenames.get_models(models_base, 'nonperiodic')
    
    periodic_models_str = ' '.join(periodic_models)
    non_periodic_models_str = ' '.join(non_periodic_models)

    periodic_models_str = "--periodic-models {}".format(periodic_models_str)
    non_periodic_models_str = "--nonperiodic-models {}".format(non_periodic_models_str)

    periodic_offset_start_str = utils.get_config_argument(config,
                                                          'periodic_offset_start',
                                                          default=metagene_options['periodic_offset_start'])
    periodic_offset_end_str = utils.get_config_argument(config,
                                                        'periodic_offset_end',
                                                        default=metagene_options['periodic_offset_end'])
    metagene_profile_length_str = utils.get_config_argument(config,
                                                            'metagene_profile_length',
                                                            default=metagene_options['metagene_profile_length'])
    seed_str = utils.get_config_argument(config,
                                         'seed',
                                         default=metagene_options['seed'])
    chains_str = utils.get_config_argument(config,
                                           'chains',
                                           default=metagene_options['chains'])
    iterations_str = utils.get_config_argument(config,
                                               'metagene_iterations',
                                               'iterations',
                                               default=metagene_options['metagene_iterations'])

    cmd = ("estimate-metagene-profile-bayes-factors {} {} --num-cpus {} {} {} "
           "{} {} {} {} {} {} {}".format(metagene_profiles,
                                         metagene_profile_bayes_factors,
                                         args.num_cpus,
                                         periodic_models_str,
                                         non_periodic_models_str,
                                         periodic_offset_start_str,
                                         periodic_offset_end_str,
                                         metagene_profile_length_str,
                                         seed_str,
                                         chains_str,
                                         iterations_str,
                                         logging_str))

    in_files = [metagene_profiles]
    in_files.extend(periodic_models)
    in_files.extend(non_periodic_models)
    out_files = [metagene_profile_bayes_factors]
    file_checkers = {
        metagene_profile_bayes_factors: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)
    
    # select the best read lengths for constructing the signal
    periodic_offsets = filenames.get_periodic_offsets(config['riboseq_data'],
                                                      args.name,
                                                      is_unique=is_unique,
                                                      note=note)

    cmd = "select-periodic-offsets {} {}".format(metagene_profile_bayes_factors,
                                                 periodic_offsets)

    in_files = [metagene_profile_bayes_factors]
    out_files = [periodic_offsets]
    file_checkers = {
        periodic_offsets: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # get the lengths and offsets which meet the required criteria from the config file
    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(config,
                                                                   args.name,
                                                                   args.do_not_call,
                                                                   is_unique=is_unique,
                                                                   default_params=metagene_options)

    if len(lengths) == 0:
        msg = ("No periodic read lengths and offsets were found. Try relaxing "
               "min_metagene_profile_count, min_metagene_bf_mean, max_metagene_bf_var, "
               "and/or min_metagene_bf_likelihood. Quitting.")
        logger.critical(msg)
        return

    lengths_str = ' '.join(lengths)
    offsets_str = ' '.join(offsets)

    seqname_prefix_str = utils.get_config_argument(config, 'seqname_prefix')
    
    # extract the riboseq profiles for each orf
    unique_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                args.name,
                                                is_unique=is_unique,
                                                note=note)

    profiles_filename = filenames.get_riboseq_profiles(config['riboseq_data'],
                                                       args.name,
                                                       length=lengths,
                                                       offset=offsets,
                                                       is_unique=is_unique,
                                                       note=note)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    cmd = ("extract-orf-profiles {} {} {} {} --lengths {} --offsets {} {} {} --num-cpus {} ".format(
        unique_filename,
        orfs_genomic,
        exons_file,
        profiles_filename,
        lengths_str,
        offsets_str,
        logging_str,
        seqname_prefix_str,
        args.num_cpus))

    in_files = [orfs_genomic, exons_file, unique_filename]
    out_files = [profiles_filename]

    # todo: implement a file checker for mtx files
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)
def main():
    """Plot the (log) Bayes factor against the estimated RPKM for all ORFs.

    Command-line entry point. The yaml config file is read, the Bayes factor
    file (or, with ``--use-predictions``, the predicted ORFs file) and the
    unsmoothed file holding the raw counts are located for the dataset
    ``name``, an approximate RPKM is computed for each ORF, and a scatter
    plot with one series per ORF type label is saved to ``out``.

    RPKM and Bayes factor values are clipped to the ranges given by
    ``--min-rpkm``/``--max-rpkm`` and ``--min-bf``/``--max-bf`` before
    plotting. If either input file does not exist, a warning is logged and
    the function returns without creating the plot.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script plots the (log) Bayes factor against the estimated "
        "RPKM for all ORFs. All relevant values will be clipped according to the "
        "specified arguments for viewing.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name',
                        help="The name of the dataset or replicate to plot")
    parser.add_argument('out', help="The output image file")

    parser.add_argument(
        '-p',
        '--use-predictions',
        help="If this flag is present, then "
        "the \"predicted ORFs\" files will be used. Otherwise, all ORFs in the dataset "
        "will be visualized.",
        action='store_true')
    parser.add_argument(
        '-r',
        '--is-replicate',
        help="If the name corresponds to one "
        "of the replicates, this flag must be used to ensure the filenames are "
        "handled correctly.",
        action='store_true')

    parser.add_argument('--title', default=default_title)

    parser.add_argument('--min-rpkm', type=float, default=default_min_rpkm)
    parser.add_argument('--max-rpkm', type=float, default=default_max_rpkm)
    parser.add_argument('--min-bf', type=float, default=default_min_bf)
    parser.add_argument('--max-bf', type=float, default=default_max_bf)

    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    # close the config file as soon as it has been parsed
    with open(args.config) as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)
    note = config.get('note', None)

    # replicates do not use read lengths and offsets in their filenames
    if args.is_replicate:
        lengths = None
        offsets = None
    else:
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name)

    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations',
                                        None)

    # we will need these to get the appropriate log BFs
    if args.use_predictions:
        bayes_factors = filenames.get_riboseq_predicted_orfs(
            config['riboseq_data'],
            args.name,
            length=lengths,
            offset=offsets,
            is_unique=True,
            note=note,
            is_smooth=True,
            fraction=fraction,
            reweighting_iterations=reweighting_iterations)
    else:
        bayes_factors = filenames.get_riboseq_bayes_factors(
            config['riboseq_data'],
            args.name,
            length=lengths,
            offset=offsets,
            is_unique=True,
            note=note,
            is_smooth=True,
            fraction=fraction,
            reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors):
        msg = (
            "Could not find the Bayes factor file: {}\nIf this is for a particular "
            "sample and the --merge-replicates option was used, this is not a problem. "
            "Will not create this scatter plot".format(bayes_factors))
        logger.warning(msg)
        return

    msg = "Reading Bayes factors"
    logger.info(msg)
    bayes_factors = bio.read_bed(bayes_factors)

    # we need these to get the raw counts for calculating RPKM

    # we always need all of the counts, so no need to check which ORFs
    rpchi_pvalues = filenames.get_riboseq_bayes_factors(config['riboseq_data'],
                                                        args.name,
                                                        length=lengths,
                                                        offset=offsets,
                                                        is_unique=True,
                                                        note=note,
                                                        is_smooth=False)

    if not os.path.exists(rpchi_pvalues):
        msg = (
            "Could not find the Rp-chi pvalues file: {}\nIf this is for a particular "
            "sample and the --merge-replicates option was used, this is not a problem. "
            "Will not create this scatter plot".format(rpchi_pvalues))
        logger.warning(msg)
        return

    msg = "Reading Rp-chi pvalues"
    logger.info(msg)
    rpchi_pvalues = bio.read_bed(rpchi_pvalues)

    msg = "Calculating RPKM values"
    logger.info(msg)

    # we approximate the number of mapping reads as the sum across all ORFs.
    # this double-counts some reads
    num_reads = np.sum(rpchi_pvalues['profile_sum'])
    all_rpkm = (1e6 * rpchi_pvalues['x_1_sum']) / (rpchi_pvalues['orf_len'] *
                                                   num_reads)

    if args.use_predictions:
        # The predicted ORFs come from a different file than the counts, so
        # row positions in the two frames cannot be assumed to correspond.
        # Look the RPKM up by ORF id so that it shares the index (and order)
        # of bayes_factors; ids absent from the counts map to NaN and are
        # dropped by the "> 0" filter below.
        rpkm_by_id = dict(zip(rpchi_pvalues['id'], all_rpkm))
        plot_rpkm = bayes_factors['id'].map(rpkm_by_id)
    else:
        # NOTE(review): as in the original code, the two data frames are
        # assumed to contain the same ORFs in the same order here
        plot_rpkm = all_rpkm

    # only include things that have some reads in the visualization
    m_rpkm = plot_rpkm > 0

    msg = "Creating plot"
    logger.info(msg)

    fig, ax = plt.subplots(figsize=(10, 5))

    cm = plt.cm.gist_earth
    num_labels = len(ribo_utils.orf_type_labels)

    for i, orf_label in enumerate(ribo_utils.orf_type_labels):

        orf_types = ribo_utils.orf_type_labels_mapping[orf_label]
        m_type = bayes_factors['orf_type'].isin(orf_types)

        # both values are taken from the same rows, so each point pairs the
        # RPKM and the Bayes factor of a single ORF
        m_selected = m_rpkm & m_type
        rpkm = np.array(plot_rpkm[m_selected])
        bfs = np.array(bayes_factors.loc[m_selected, 'bayes_factor_mean'])

        rpkm = np.clip(rpkm, args.min_rpkm, args.max_rpkm)
        bfs = np.clip(bfs, args.min_bf, args.max_bf)

        # spread the series evenly over the colormap
        color = cm(i / num_labels)

        label = "{} ({})".format(orf_label, len(rpkm))

        ax.scatter(rpkm, bfs, label=label, color=color, edgecolor='k')

    # leave some room around the clipped values
    ax.set_ylim((args.min_bf * 1.5, args.max_bf * 1.5))
    ax.set_xlim((args.min_rpkm * 1.5, args.max_rpkm * 1.25))

    ax.set_yscale('symlog')
    ax.set_xscale('symlog')

    ax.set_xlabel('RPKM')
    ax.set_ylabel('log BF')

    # the legend sits outside the axes, so it must be passed to savefig
    # explicitly to be included in the tight bounding box
    lgd = ax.legend(loc='center right', bbox_to_anchor=(1.5, 0.5))

    # an empty (or missing) title means "no title"
    if args.title:
        ax.set_title(args.title)

    fig.savefig(args.out, bbox_inches='tight', bbox_extra_artists=(lgd, ))
def main():
    """Submit the jobs which create the per-read-length ORF profiles.

    Command-line entry point. For the sample ``name`` (or, with
    ``--is-condition``, every replicate listed for that condition in the
    config file), one ``extract-orf-profiles`` command is submitted through
    ``slurm.check_sbatch`` for each periodic read length and offset pair.
    A final ``collect-read-length-orf-profiles`` command is then submitted
    with all of the extraction jobs as its dependencies.

    If no periodic read lengths are found for a sample, a critical message
    is logged and the function returns immediately; the collection command
    is not submitted in that case.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Extract the ORF profiles for each specified read length "
        "and offset independently, creating one sparse matrix file (mtx) for "
        "each read length. These are then collected into a 'sparse tensor'.")

    parser.add_argument('config', help="The yaml config file.")
    parser.add_argument('name', help="The name of either one of the 'riboseq_samples' "
        "or 'riboseq_biological_replicates' from the config file.")

    parser.add_argument('out', help="The output (txt.gz) file. N.B. The output uses "
        "base-0 indexing, contrary to the unsmoothed ORF profiles, which are written "
        "using the matrix market format (base-1 indexing).")

    parser.add_argument('-c', '--is-condition', help="If this flag is present, "
        "then 'name' will be taken to be a condition name. The profiles for "
        "all relevant replicates of the condition will be created.",
        action='store_true')

    parser.add_argument('--add-ids', help="If this flag is present, "
        "then orf_ids will be added to the final output.", action='store_true')

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # these are forwarded to the commands submitted below
    logging_str = logging_utils.get_logging_options_string(args)
    cpus_str = "--num-cpus {}".format(args.num_cpus)

    msg = "[create-read-length-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    msg = "Reading config file"
    logger.info(msg)
    # close the config file as soon as it has been parsed
    with open(args.config) as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)

    # pull out what we need from the config file
    is_unique = 'keep_riboseq_multimappers' not in config
    seqname_str = utils.get_config_argument(config, 'seqname_prefix')
    note = config.get('note', None)
    orf_note = config.get('orf_note', None)

    orfs = filenames.get_orfs(
        config['genome_base_path'],
        config['genome_name'],
        note=orf_note
    )

    exons = filenames.get_exons(
        config['genome_base_path'],
        config['genome_name'],
        note=orf_note,
        is_orf=True
    )

    # make sure the necessary files exist
    required_files = [orfs, exons]
    msg = "[create-read-length-orf-profiles]: Some input files were missing: "
    utils.check_files_exist(required_files, msg=msg)

    # process one sample or all samples from condition
    names = [args.name]
    is_condition_str = ""
    if args.is_condition:
        is_condition_str = "--is-condition"
        riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
        names = list(riboseq_replicates[args.name])

    # the ids of the extraction jobs; the collection job depends on them
    job_ids = []
    for name in names:

        msg = "Processing sample: {}".format(name)
        logger.info(msg)

        # now the relevant files
        bam = filenames.get_riboseq_bam(
            config['riboseq_data'],
            name,
            is_unique=is_unique,
            note=note
        )

        # make sure the necessary files exist
        required_files = [bam]
        msg = "[create-read-length-orf-profiles]: Some input files were missing: "
        utils.check_files_exist(required_files, msg=msg)

        # get the lengths and offsets which meet the required criteria from the config file
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config,
            name,
            is_unique=is_unique
        )

        if len(lengths) == 0:
            msg = ("No periodic read lengths and offsets were found. Try relaxing "
                "min_metagene_profile_count, min_metagene_bf_mean, "
                "max_metagene_bf_var, and/or min_metagene_bf_likelihood. Quitting.")
            logger.critical(msg)
            return

        # one extraction job (and one mtx file) per read length
        for length, offset in zip(lengths, offsets):
            lengths_str = "--lengths {}".format(length)
            offsets_str = "--offsets {}".format(offset)

            mtx = filenames.get_riboseq_profiles(
                config['riboseq_data'],
                name,
                length=[length],
                offset=[offset],
                is_unique=is_unique,
                note=note
            )

            # one placeholder per argument: str.format silently ignores
            # surplus positional arguments, which would drop logging_str
            cmd = "extract-orf-profiles {} {} {} {} {} {} {} {} {}".format(
                bam,
                orfs,
                exons,
                mtx,
                lengths_str,
                offsets_str,
                seqname_str,
                cpus_str,
                logging_str
            )

            job_id = slurm.check_sbatch(cmd, args=args)

            job_ids.append(job_id)

    add_ids_str = ""
    if args.add_ids:
        add_ids_str = "--add-ids"

    # as above, the number of placeholders must match the number of arguments
    cmd = "collect-read-length-orf-profiles {} {} {} {} {} {}".format(
        args.config,
        args.name,
        args.out,
        is_condition_str,
        add_ids_str,
        logging_str
    )

    slurm.check_sbatch(cmd, args=args, dependencies=job_ids)
Example #9
0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script creates the plots which detail the basic characteristics "
        "of the ORF predictions from the Rp-Bp pipeline. It also creates and compiles (if "
        "possible) a latex report for them.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('out',
                        help="The base output directory for the latex report")

    parser.add_argument(
        '--show-unfiltered-orfs',
        help="If this flag is "
        "present, bar charts showing the distribution of the types of the "
        "unfiltered ORF set will be included",
        action='store_true')

    parser.add_argument(
        '--show-orf-periodicity',
        help="If this flag is "
        "present, bar charts showing the periodicity of each ORF type will be "
        "included in the report.",
        action='store_true')

    parser.add_argument('--uniprot',
                        help="The uniprot ORF lengths, if available",
                        default=default_uniprot)
    parser.add_argument('--uniprot-label',
                        help="The label to use for the uniprot ORFs in "
                        "the plot",
                        default=default_uniprot_label)

    parser.add_argument('--image-type',
                        help="The format of the image files. This must be "
                        "a format usable by matplotlib.",
                        default=default_image_type)

    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files will "
                        "be overwritten.",
                        action='store_true')

    parser.add_argument(
        '--note',
        help="If this option is given, it will be used in the "
        "filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=None)

    parser.add_argument(
        '--show-chisq',
        help="If this flag is given, then the "
        "results from Rp-chi will be included in the document; otherwise, they "
        "will not be created or shown.",
        action='store_true')

    parser.add_argument('-t',
                        '--tmp',
                        help="A location for temporary files",
                        default=None)

    slurm.add_sbatch_options(parser, num_cpus=default_num_cpus)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    programs = [
        'create-orf-length-distribution-line-graph',
        'create-orf-types-bar-chart', 'visualize-orf-type-metagene-profiles'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = ['riboseq_data', 'riboseq_samples']
    utils.check_keys_exist(config, required_keys)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # by default, we will not include chisq
    chisq_values = [False]
    if args.show_chisq:
        chisq_values = [True, False]

    filtered_values = [True]
    if args.show_unfiltered_orfs:
        filtered_values = [True, False]

    grouped_values = [True, False]

    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)

    # first, create all of the figures
    create_all_figures(config, args)

    note_str = config.get('note', None)
    out_note_str = note_str

    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note

    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations',
                                        None)

    project_name = config.get("project_name", default_project_name)
    title = "Rp-Bp prediction analysis for {}".format(project_name)
    abstract = "This document shows the results of the Rp-Bp pipeline analysis."

    #tex_file = os.path.join(args.out, "prediction-report.tex")
    tex_file = filenames.get_rpbp_prediction_report(args.out, out_note_str)

    with open(tex_file, 'w') as out:

        latex.begin_document(out, title, abstract)

        latex.write(out, "\n")

        latex.clearpage(out)

        ### ORF type distributions
        title = "Predicted ORF type distributions"
        latex.section(out, title)

        # first, handle all of the regular datasets
        sample_names = sorted(config['riboseq_samples'].keys())

        # and check if we also have replicates
        replicate_names = []
        if 'riboseq_biological_replicates' in config:
            replicate_names = sorted(
                ribo_utils.get_riboseq_replicates(config).keys())

        strands = ["+", "-"]

        i = 0
        for sample_name in sample_names:

            try:
                lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                    config,
                    sample_name,
                    is_unique=is_unique,
                    default_params=metagene_options)
            except FileNotFoundError:
                msg = (
                    "Could not parse out lengths and offsets for sample: {}. "
                    "Skipping".format(sample_name))
                logger.error(msg)
                continue

            caption = "ORF types: {}".format(sample_name)
            is_first = True

            # first, just dump all of the bar charts to the page
            it = itertools.product(grouped_values, chisq_values,
                                   filtered_values)

            for is_grouped, is_chisq, is_filtered in it:

                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                orf_types_bar_chart = filenames.get_orf_types_bar_chart(
                    config['riboseq_data'],
                    sample_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq,
                    is_filtered=is_filtered)

                msg = "Looking for image file: {}".format(orf_types_bar_chart)
                logger.debug(msg)

                if os.path.exists(orf_types_bar_chart):
                    if is_first or (i % 4 == 0):
                        latex.begin_figure(out)
                        is_first = False

                    i += 1
                    latex.write_graphics(out, orf_types_bar_chart, height=0.15)

                    if i % 6 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)

                else:
                    msg = "Could not find image: {}".format(
                        orf_types_bar_chart)
                    logger.warning(msg)

            if (i > 0) and (i % 6) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i % 6 != 0:
            latex.clearpage(out)

        # now, if the config file specifies replicates, create figures for those
        i = 0
        for replicate_name in replicate_names:
            lengths = None
            offsets = None

            caption = "ORF types: {}".format(replicate_name)

            it = itertools.product(grouped_values, chisq_values,
                                   filtered_values)

            is_first = True

            for is_grouped, is_chisq, is_filtered in it:

                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                orf_types_bar_chart = filenames.get_orf_types_bar_chart(
                    config['riboseq_data'],
                    replicate_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq,
                    is_filtered=is_filtered)

                msg = "Looking for image file: {}".format(orf_types_bar_chart)
                logger.debug(msg)

                if os.path.exists(orf_types_bar_chart):
                    if is_first or (i % 4 == 0):
                        latex.begin_figure(out)
                        is_first = False

                    i += 1
                    latex.write_graphics(out, orf_types_bar_chart, height=0.15)

                    if i % 6 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)

                else:
                    msg = "Could not find image: {}".format(
                        orf_types_bar_chart)
                    logger.warning(msg)

            if (i > 0) and (i % 6) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i % 6 != 0:
            latex.clearpage(out)

        ### ORF type length distributions
        title = "Predicted ORF type length distributions"
        latex.section(out, title)

        i = 0
        for sample_name in sample_names:

            try:
                lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                    config,
                    sample_name,
                    is_unique=is_unique,
                    default_params=metagene_options)
            except FileNotFoundError:
                msg = (
                    "Could not parse out lengths and offsets for sample: {}. "
                    "Skipping".format(sample_name))
                logger.error(msg)
                continue

            caption = "ORF type length distributions: {}".format(sample_name)

            is_first = True
            it = itertools.product(grouped_values, chisq_values)

            for is_grouped, is_chisq in it:

                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                orf_length_line_graph = filenames.get_orf_length_distribution_line_graph(
                    config['riboseq_data'],
                    sample_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq)

                if os.path.exists(orf_length_line_graph):

                    if is_first or (i % 4 == 0):
                        latex.begin_figure(out)
                        is_first = False

                    i += 1
                    latex.write_graphics(out,
                                         orf_length_line_graph,
                                         height=0.15)

                    if i % 4 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)

                else:
                    msg = "Could not find image: {}".format(
                        orf_length_line_graph)
                    logger.debug(msg)

            if (i > 0) and (i % 4) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i % 4 != 0:
            latex.clearpage(out)

        # now, if the config file specifies replicates, create figures for those
        i = 0
        for replicate_name in replicate_names:
            lengths = None
            offsets = None

            caption = "ORF types: {}".format(replicate_name)

            is_first = True
            it = itertools.product(grouped_values, chisq_values)

            for is_grouped, is_chisq in it:

                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                orf_length_line_graph = filenames.get_orf_length_distribution_line_graph(
                    config['riboseq_data'],
                    replicate_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq)

                if os.path.exists(orf_length_line_graph):

                    if is_first or (i % 4 == 0):
                        latex.begin_figure(out)
                        is_first = False

                    i += 1
                    latex.write_graphics(out,
                                         orf_length_line_graph,
                                         height=0.15)

                    if i % 4 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)

                else:
                    msg = "Could not find image: {}".format(
                        orf_length_line_graph)
                    logger.debug(msg)

            if (i > 0) and (i % 4) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i % 4 != 0:
            latex.clearpage(out)

        ### ORF type metagene profiles
        if args.show_orf_periodicity:
            title = "Predicted ORF type metagene profiles"
            latex.section(out, title)

            i = 0
            for sample_name in sample_names:

                try:
                    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                        config,
                        sample_name,
                        is_unique=is_unique,
                        default_params=metagene_options)
                except FileNotFoundError:
                    msg = (
                        "Could not parse out lengths and offsets for sample: {}. "
                        "Skipping".format(sample_name))
                    logger.error(msg)
                    continue

                caption = "ORF type metagene profiles: {}".format(sample_name)

                is_first = True

                for is_chisq in chisq_values:

                    if is_chisq:
                        f = None
                        rw = None
                    else:
                        f = fraction
                        rw = reweighting_iterations

                    orf_type_profile_base = filenames.get_orf_type_profile_base(
                        config['riboseq_data'],
                        sample_name,
                        length=lengths,
                        offset=offsets,
                        is_unique=is_unique,
                        note=out_note_str,
                        fraction=f,
                        reweighting_iterations=rw,
                        is_chisq=is_chisq)

                    it = itertools.product(ribo_utils.orf_types, strands)

                    for orf_type, strand in it:
                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base, orf_type, strand,
                            args.image_type)

                        msg = "Looking for image file: {}".format(
                            orf_type_profile)
                        logger.debug(msg)
                        if os.path.exists(orf_type_profile):

                            if is_first or (i % 4 == 0):
                                latex.begin_figure(out)
                                is_first = False

                            i += 1
                            latex.write_graphics(out,
                                                 orf_type_profile,
                                                 height=0.23)

                            if i % 4 == 0:
                                latex.write_caption(out, caption)
                                latex.end_figure(out)
                                latex.clearpage(out)

                        else:
                            msg = "Could not find image: {}".format(
                                orf_type_profile)
                            logger.warning(msg)

                if (i > 0) and (i % 4 != 0):
                    latex.write_caption(out, caption)
                    latex.end_figure(out)
                    #latex.clearpage(out)

            if i % 4 != 0:
                latex.clearpage(out)

            i = 0
            for replicate_name in replicate_names:
                lengths = None
                offsets = None

                caption = "ORF type metagene profiles: {}".format(
                    replicate_name)
                is_first = True
                for is_chisq in chisq_values:

                    if is_chisq:
                        f = None
                        rw = None
                    else:
                        f = fraction
                        rw = reweighting_iterations

                    orf_type_profile_base = filenames.get_orf_type_profile_base(
                        config['riboseq_data'],
                        replicate_name,
                        length=lengths,
                        offset=offsets,
                        is_unique=is_unique,
                        note=out_note_str,
                        fraction=f,
                        reweighting_iterations=rw,
                        is_chisq=is_chisq)

                    it = itertools.product(ribo_utils.orf_types, strands)

                    for orf_type, strand in it:

                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base, orf_type, strand,
                            args.image_type)

                        if os.path.exists(orf_type_profile):

                            if is_first or (i % 4 == 0):
                                latex.begin_figure(out)
                                is_first = False

                            i += 1
                            latex.write_graphics(out,
                                                 orf_type_profile,
                                                 height=0.23)

                            if i % 4 == 0:
                                latex.write_caption(out, caption)
                                latex.end_figure(out)
                                latex.clearpage(out)
                        else:
                            msg = "Could not find image: {}".format(
                                orf_type_profile)
                            logger.debug(msg)

                if (i > 0) and (i % 4 != 0):
                    latex.write_caption(out, caption)
                    latex.end_figure(out)
                    #latex.clearpage(out)

            if i % 4 != 0:
                latex.clearpage(out)

        latex.end_document(out)

    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)
# Example #10
# 0
def _get_title_option(title):
    """ Build the shell-quoted "--title" command line option for a figure. """
    return "--title {}".format(shlex.quote(title))


def _get_prediction_method_options(is_chisq, fraction, reweighting_iterations):
    """ Collect the settings which differ between the chi-square and the
        Bayesian predictions.

    Returns:
        a tuple (label, f, rw, orfs_kwargs), where
            label: the method name appended to the figure titles
            f, rw: the smoothing fraction and reweighting iterations used
                for the output filenames (both None for chi-square)
            orfs_kwargs: the method-specific keyword arguments for
                filenames.get_riboseq_predicted_orfs
    """
    if is_chisq:
        # a raw string is required here: "\c" is not a valid escape sequence
        return r"Rp-$\chi^2$", None, None, {'is_chisq': True}

    orfs_kwargs = {
        'fraction': fraction,
        'reweighting_iterations': reweighting_iterations
    }
    return "Rp-Bp", fraction, reweighting_iterations, orfs_kwargs


def _create_figures(name_pretty_name_is_replicate, config, args):
    """ This function creates all of the figures in the prediction report
        for the given dataset.

    The figures are created by external commands, which are only called
    (via shell_utils.call_if_not_exists) when the images do not already
    exist or when args.overwrite is given:

        * ORF types bar charts (create-orf-types-bar-chart)
        * ORF length distribution line graphs
          (create-orf-length-distribution-line-graph)
        * ORF type metagene profiles
          (visualize-orf-type-metagene-profiles), only if
          args.show_orf_periodicity is set

    Args:
        name_pretty_name_is_replicate: a 3-tuple (name, pretty_name,
            is_replicate). name is used to build the filenames,
            pretty_name is used in the figure titles, and is_replicate
            controls whether the periodic lengths and offsets are looked up.

        config: the project configuration. The keys read here are
            'riboseq_data', 'keep_riboseq_multimappers' (presence only),
            'note', 'smoothing_fraction' and
            'smoothing_reweighting_iterations'.

        args: the parsed command line options. The attributes read here are
            show_chisq, show_unfiltered_orfs, note, image_type, uniprot,
            uniprot_label, show_orf_periodicity and overwrite. It is also
            passed to logging_utils.get_logging_options_string.

    Returns:
        None. If the periodic lengths and offsets of a (non-replicate)
        sample cannot be found, an error is logged and no figures are
        created for it.
    """
    name, pretty_name, is_replicate = name_pretty_name_is_replicate

    # keep multimappers?
    is_unique = 'keep_riboseq_multimappers' not in config

    # by default, we will not include chisq
    chisq_values = [True, False] if args.show_chisq else [False]

    # by default, only the filtered ORFs are shown
    filtered_values = [True, False] if args.show_unfiltered_orfs else [True]

    grouped_values = [True, False]

    logging_str = logging_utils.get_logging_options_string(args)

    # the input files always use the note from the config file, while a note
    # given on the command line takes precedence for the output files
    note_str = config.get('note', None)
    out_note_str = note_str
    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note

    image_type_str = "--image-type {}".format(args.image_type)

    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations',
                                        None)

    # if this is a replicate, we do not worry about lengths and offsets
    if is_replicate:
        lengths = None
        offsets = None
    else:
        try:
            lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                config,
                name,
                is_unique=is_unique,
                default_params=metagene_options)
        except FileNotFoundError:
            msg = ("Could not parse out lengths and offsets for sample: {}. "
                   "Skipping".format(name))
            logger.error(msg)
            return

    unsmoothed_profiles = filenames.get_riboseq_profiles(
        config['riboseq_data'],
        name,
        length=lengths,
        offset=offsets,
        is_unique=is_unique,
        note=note_str,
        is_smooth=False)

    ### ORF types bar charts
    msg = "{}: creating the ORF types bar charts".format(name)
    logger.debug(msg)

    it = itertools.product(grouped_values, chisq_values, filtered_values)

    for is_grouped, is_chisq, is_filtered in it:

        is_grouped_str = ", Grouped" if is_grouped else ""
        is_filtered_str = ", Filtered" if is_filtered else ""

        label, f, rw, orfs_kwargs = _get_prediction_method_options(
            is_chisq, fraction, reweighting_iterations)

        title = "{}{}{}, {}".format(pretty_name, is_grouped_str,
                                    is_filtered_str, label)
        title_str = _get_title_option(title)

        orfs = filenames.get_riboseq_predicted_orfs(
            config['riboseq_data'],
            name,
            length=lengths,
            offset=offsets,
            is_unique=is_unique,
            note=note_str,
            is_filtered=is_filtered,
            **orfs_kwargs)

        use_groups_str = "--use-groups" if is_grouped else ""

        orf_types_bar_chart = filenames.get_orf_types_bar_chart(
            config['riboseq_data'],
            name,
            length=lengths,
            offset=offsets,
            is_unique=is_unique,
            note=out_note_str,
            image_type=args.image_type,
            fraction=f,
            reweighting_iterations=rw,
            is_grouped=is_grouped,
            is_chisq=is_chisq,
            is_filtered=is_filtered)

        cmd = "create-orf-types-bar-chart {} {} {} {}".format(
            orfs, orf_types_bar_chart, title_str, use_groups_str)

        in_files = [orfs]
        out_files = [orf_types_bar_chart]
        shell_utils.call_if_not_exists(cmd,
                                       out_files,
                                       in_files=in_files,
                                       overwrite=args.overwrite)

    ### ORF length distribution line graphs
    msg = "{}: creating the ORF length distributions line graph".format(name)
    logger.debug(msg)

    # the uniprot options are only passed along if the file actually exists
    uniprot_str = ""
    uniprot_label_str = ""
    if os.path.exists(args.uniprot):
        uniprot_str = "--uniprot {}".format(args.uniprot)
        uniprot_label_str = shlex.quote(args.uniprot_label)
        uniprot_label_str = "--uniprot-label {}".format(uniprot_label_str)

    for is_grouped, is_chisq in itertools.product(grouped_values,
                                                  chisq_values):

        label, f, rw, orfs_kwargs = _get_prediction_method_options(
            is_chisq, fraction, reweighting_iterations)

        title_str = _get_title_option("{}, {}".format(pretty_name, label))

        # the filtering status is not specified for these ORFs, so the
        # default of the filename helper is used
        orfs = filenames.get_riboseq_predicted_orfs(
            config['riboseq_data'],
            name,
            length=lengths,
            offset=offsets,
            is_unique=is_unique,
            note=note_str,
            **orfs_kwargs)

        use_groups_str = "--use-groups" if is_grouped else ""

        orf_length_line_graph = filenames.get_orf_length_distribution_line_graph(
            config['riboseq_data'],
            name,
            length=lengths,
            offset=offsets,
            is_unique=is_unique,
            note=out_note_str,
            image_type=args.image_type,
            fraction=f,
            reweighting_iterations=rw,
            is_grouped=is_grouped,
            is_chisq=is_chisq)

        cmd = (
            "create-orf-length-distribution-line-graph {} {} {} {} {} {}".
            format(orfs, orf_length_line_graph, title_str, use_groups_str,
                   uniprot_str, uniprot_label_str))

        in_files = [orfs]
        out_files = [orf_length_line_graph]
        shell_utils.call_if_not_exists(cmd,
                                       out_files,
                                       in_files=in_files,
                                       overwrite=args.overwrite)

    ### ORF type metagene profiles
    if args.show_orf_periodicity:
        msg = "{}: creating the ORF type metagene profiles".format(name)
        logger.debug(msg)

        # NOTE(review): the chi-square ORFs below were originally selected
        # with the "is_filtered" variable left over from the bar chart loop,
        # i.e., the last entry of filtered_values. That value is kept here
        # (now explicitly), but the Bayesian ORFs do not specify a filtering
        # status at all -- confirm which of the two is intended.
        chisq_is_filtered = filtered_values[-1]

        # the unsmoothed profiles are used for both prediction methods
        profiles = unsmoothed_profiles

        for is_chisq in chisq_values:

            label, f, rw, orfs_kwargs = _get_prediction_method_options(
                is_chisq, fraction, reweighting_iterations)

            if is_chisq:
                orfs_kwargs['is_filtered'] = chisq_is_filtered

            title_str = _get_title_option("{}, {}".format(pretty_name, label))

            orfs = filenames.get_riboseq_predicted_orfs(
                config['riboseq_data'],
                name,
                length=lengths,
                offset=offsets,
                is_unique=is_unique,
                note=note_str,
                **orfs_kwargs)

            orf_type_profile_base = filenames.get_orf_type_profile_base(
                config['riboseq_data'],
                name,
                length=lengths,
                offset=offsets,
                is_unique=is_unique,
                note=out_note_str,
                fraction=f,
                reweighting_iterations=rw,
                is_chisq=is_chisq)

            # one image is expected for each ORF type on each strand: first
            # all of the forward strand images, then all of the reverse ones
            out_files = [
                filenames.get_orf_type_profile_image(orf_type_profile_base,
                                                     orf_type, strand,
                                                     args.image_type)
                for strand in ("+", "-")
                for orf_type in ribo_utils.orf_types
            ]

            cmd = ("visualize-orf-type-metagene-profiles {} {} {} {} {} {}".
                   format(orfs, profiles, orf_type_profile_base, title_str,
                          image_type_str, logging_str))

            in_files = [orfs]
            shell_utils.call_if_not_exists(cmd,
                                           out_files,
                                           in_files=in_files,
                                           overwrite=args.overwrite)