コード例 #1
0
def main():
    """Collect per-sample read counts and write them to a csv file.

    Parses the command line, passes ``samtools`` to
    ``shell_utils.check_programs_exist``, loads the yaml config and applies
    ``get_counts`` to every item of ``config['riboseq_samples']`` through
    ``parallel.apply_parallel_iter``. ``None`` results are dropped; the
    remaining ones are assembled into a data frame and written to
    ``args.out`` without the index.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script collects counts of riboseq reads filtered at each step in "
        "the micropeptide prediction pipeline. It mostly parses fastqc results (using the "
        "crimson python package).")
    parser.add_argument('config', help="The yaml config file")
    parser.add_argument('out', help="The output csv file with the counts")
    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of processors to use",
                        type=int,
                        default=default_num_cpus)
    parser.add_argument('--overwrite', action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    programs = ['samtools']
    shell_utils.check_programs_exist(programs)

    # use a context manager so the config file handle is closed right after
    # parsing instead of being left to the garbage collector
    with open(args.config) as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)

    res = parallel.apply_parallel_iter(config['riboseq_samples'].items(),
                                       args.num_cpus, get_counts, config, args)

    # get_counts signals "nothing to report" for a sample by returning None
    res = [r for r in res if r is not None]
    res_df = pd.DataFrame(res)

    pandas_utils.write_df(res_df, args.out, index=False)
コード例 #2
0
def create_all_figures(config, args):
    """Run ``_create_figures`` for every sample and every replicate group.

    One ``(name, pretty_name, is_replicate)`` tuple is built per key of
    ``config['riboseq_samples']`` (flag ``False``) and per key returned by
    ``ribo_utils.get_riboseq_replicates`` (flag ``True``). Both groups are
    sorted by name, samples come first, and the combined list is handed to
    ``parallel.apply_parallel_iter`` together with ``config`` and ``args``.
    """
    # the individual samples: these are not replicates
    sorted_samples = sorted(config['riboseq_samples'].keys())
    pretty_sample_names = ribo_utils.get_sample_name_map(config)
    sample_jobs = []
    for sample in sorted_samples:
        sample_jobs.append((sample, pretty_sample_names[sample], False))

    # the merged replicates (conditions)
    sorted_conditions = sorted(
        ribo_utils.get_riboseq_replicates(config).keys())
    pretty_condition_names = ribo_utils.get_riboseq_condition_name_map(config)
    condition_jobs = []
    for condition in sorted_conditions:
        condition_jobs.append(
            (condition, pretty_condition_names[condition], True))

    parallel.apply_parallel_iter(sample_jobs + condition_jobs,
                                 args.num_cpus,
                                 _create_figures,
                                 config,
                                 args,
                                 progress_bar=True)
コード例 #3
0
def main():
    """Create the latex preprocessing report for a riboseq project.

    Steps, in order:

    1. Parse the command line and load the yaml config. A ``--note`` given on
       the command line replaces the ``note`` entry of the config.
    2. Pass the required external programs to
       ``shell_utils.check_programs_exist``.
    3. If ``args.use_slurm`` is set, hand the command line to
       ``slurm.check_sbatch`` and return without doing any further work.
    4. Create the figures (``create_read_filtering_plots`` and, per sample,
       ``create_figures``).
    5. Write ``preprocessing-report.tex`` into ``args.out`` and compile it
       with ``latex.compile``.
    6. If ``--create-fastqc-reports`` was given, run ``create_fastqc_reports``
       for every sample via ``parallel.apply_parallel_iter``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script creates a simple latex document containing the read "
        "filtering images, metagene profiles and analysis, and standard section text."
    )
    parser.add_argument('config',
                        help="The (yaml) config file for the project")
    parser.add_argument('out', help="The path for the output files")

    parser.add_argument(
        '--show-orf-periodicity',
        help="If this flag is "
        "present, bar charts showing the periodicity of each ORF type will be "
        "included in the report.",
        action='store_true')

    parser.add_argument(
        '--show-read-length-bfs',
        help="If this flag is given, "
        "plots showing the Bayes factor at each offset for each read length "
        "are included in the report.",
        action='store_true')

    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files will "
                        "be overwritten.",
                        action='store_true')

    parser.add_argument('--min-visualization-count',
                        help="Read lengths with fewer than this "
                        "number of reads will not be included in the report.",
                        type=int,
                        default=metagene_options['min_metagene_image_count'])

    parser.add_argument('--image-type',
                        help="The type of image types to create. This "
                        "must be an extension which matplotlib can interpret.",
                        default=default_image_type)

    parser.add_argument(
        '-c',
        '--create-fastqc-reports',
        help="If this flag is given, then "
        "fastqc reports will be created for most fastq and bam files. By default, they are "
        "not created.",
        action='store_true')

    parser.add_argument('--tmp',
                        help="If the fastqc reports are created, "
                        "they will use this location for temp files",
                        default=None)

    parser.add_argument(
        '--note',
        help="If this option is given, it will be used in the "
        "filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=None)

    slurm.add_sbatch_options(parser, num_cpus=default_num_cpus)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # use a context manager so the config file handle is closed right after
    # parsing instead of being left to the garbage collector
    with open(args.config) as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)

    # a note on the command line takes precedence over the one in the config
    if args.note is not None:
        config['note'] = args.note
    note = config.get('note', None)

    sample_names = sorted(config['riboseq_samples'].keys())

    # keep multimappers? Only the presence of the key is checked, not its
    # value.
    is_unique = not ('keep_riboseq_multimappers' in config)

    programs = [
        'create-read-length-metagene-profile-plot',
        'visualize-metagene-profile-bayes-factor',
        'get-all-read-filtering-counts', 'samtools',
        'visualize-read-filtering-counts', 'get-read-length-distribution',
        'plot-read-length-distribution'
    ]

    if args.create_fastqc_reports:
        programs.extend(['fastqc', 'java'])

    shell_utils.check_programs_exist(programs)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)

    # first, create the read filtering information...
    create_read_filtering_plots(args.config, config, args)
    # ... and all the other figures.
    for name in sample_names:
        periodic_offsets = filenames.get_periodic_offsets(
            config['riboseq_data'], name, is_unique=is_unique, note=note)
        offsets_df = pd.read_csv(periodic_offsets)
        create_figures(args.config, config, name, offsets_df, args)

    # filtering thresholds: the config value wins, otherwise fall back to the
    # defaults in metagene_options
    min_metagene_profile_count = config.get(
        'min_metagene_profile_count',
        metagene_options['min_metagene_profile_count'])

    min_bf_mean = config.get('min_metagene_bf_mean',
                             metagene_options['min_metagene_bf_mean'])

    max_bf_var = config.get('max_metagene_bf_var',
                            metagene_options['max_metagene_bf_var'])

    min_bf_likelihood = config.get(
        'min_metagene_bf_likelihood',
        metagene_options['min_metagene_bf_likelihood'])

    project_name = config.get("project_name", default_project_name)
    title = "Preprocessing results for {}".format(project_name)

    tex_file = os.path.join(args.out, "preprocessing-report.tex")
    with open(tex_file, 'w') as out:

        latex.begin_document(out, title, abstract, commands=commands)

        latex.section(out, "Introduction")

        latex.clearpage(out)
        latex.newpage(out)

        latex.section(out, "Mapping and filtering")
        latex.write(out, mapping_and_filtering_text)

        # the read filtering figures
        read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=note, image_type=args.image_type)

        # NOTE(review): when note is None this yields "no-rrna-None" --
        # confirm that this matches the name used when the image is created
        n = "no-rrna-{}".format(note)
        no_rrna_read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=n, image_type=args.image_type)

        latex.begin_figure(out)
        latex.write_graphics(out, read_filtering_image, width=0.45)
        latex.write_graphics(out, no_rrna_read_filtering_image, width=0.45)
        latex.write_caption(out,
                            read_filtering_caption,
                            label=read_filtering_label)
        latex.end_figure(out)

        latex.clearpage(out)

        # the read length distributions
        latex.section(out,
                      "Read length distributions",
                      label=length_distribution_section_label)

        msg = "Writing length distribution figures"
        logger.info(msg)

        latex.begin_table(out, "cc")

        latex.write_header(out,
                           ["All aligned reads", "Uniquely-aligning reads"])

        # one table row per sample: all reads on the left, unique on the right
        for name in sample_names:
            read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'],
                name,
                is_unique=False,
                note=note,
                image_type=args.image_type)

            unique_read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'],
                name,
                is_unique=True,
                note=note,
                image_type=args.image_type)

            msg = "Looking for image file: {}".format(
                read_length_distribution_image)
            logger.debug(msg)

            if os.path.exists(read_length_distribution_image):
                latex.write_graphics(out,
                                     read_length_distribution_image,
                                     width=0.45)
            else:
                msg = "Could not find image: {}".format(
                    read_length_distribution_image)
                logger.warning(msg)

                # keep the table cell non-empty so the layout stays intact
                text = "Missing: {}\n\n".format(name)
                latex.write(out, text)

            latex.write_column_sep(out)

            msg = "Looking for image file: {}".format(
                unique_read_length_distribution_image)
            logger.debug(msg)

            if os.path.exists(unique_read_length_distribution_image):
                latex.write_graphics(out,
                                     unique_read_length_distribution_image,
                                     width=0.45)
            else:
                msg = "Could not find image: {}".format(
                    unique_read_length_distribution_image)
                logger.warning(msg)

                text = "Missing: {}\n\n".format(name)
                latex.write(out, text)

            latex.write_row_sep(out)

        latex.end_table(out)
        latex.clearpage(out)

        latex.section(out, "Read length periodicity", label=periodicity_label)

        for name in sample_names:
            # number of images written to this sample's two-column table;
            # its parity decides between a column and a row separator
            i = 0

            msg = "Processing sample: {}".format(name)
            logger.info(msg)

            logger.debug("overwrite: {}".format(args.overwrite))

            periodic_offsets = filenames.get_periodic_offsets(
                config['riboseq_data'], name, is_unique=is_unique, note=note)
            offsets_df = pd.read_csv(periodic_offsets)

            min_read_length = int(offsets_df['length'].min())
            max_read_length = int(offsets_df['length'].max())

            latex.begin_table(out, "YY")

            header = "\\multicolumn{2}{c}{" + name + "}"
            header = [header]
            latex.write_header(out, header)

            for length in range(min_read_length, max_read_length + 1):
                msg = "Processing length: {}".format(length)
                logger.info(msg)

                # check which offset is used

                # select the row for this length
                mask_length = offsets_df['length'] == length

                # range() visits every integer between the smallest and the
                # largest length, but offsets_df need not contain a row for
                # each of them
                if not mask_length.any():
                    continue

                length_row = offsets_df[mask_length].iloc[0]

                # now, check all of the filters; later checks overwrite the
                # status set by earlier ones
                offset = int(length_row['highest_peak_offset'])
                offset_status = "Used for analysis"

                if max_bf_var is not None:
                    if ((length_row['highest_peak_bf_mean'] <= min_bf_mean) or
                            length_row['highest_peak_bf_var'] >= max_bf_var):
                        offset_status = "BF mean too small or BF var too high"

                if min_bf_likelihood is not None:
                    # probability mass above min_bf_mean under a normal
                    # distribution with the estimated mean and variance
                    likelihood = 1 - scipy.stats.norm.cdf(
                        min_bf_mean, length_row['highest_peak_bf_mean'],
                        np.sqrt(length_row['highest_peak_bf_var']))
                    if likelihood <= min_bf_likelihood:
                        offset_status = "Likehood too small"

                if (max_bf_var is None) and (min_bf_likelihood is None):
                    if length_row['highest_peak_bf_mean'] <= min_bf_mean:
                        offset_status = "BF mean too small"

                if length_row[
                        'highest_peak_profile_sum'] < min_metagene_profile_count:
                    offset_status = "Count too small"

                if length_row[
                        'highest_peak_profile_sum'] < args.min_visualization_count:
                    msg = "Not enough reads of this length. Skipping."
                    logger.warning(msg)
                    continue

                metagene_profile_image = filenames.get_metagene_profile_image(
                    config['riboseq_data'],
                    name,
                    image_type=args.image_type,
                    is_unique=is_unique,
                    length=length,
                    note=note)

                cell_title = ("Length: {}. P-site offset: {}. Status: {}\n".format(
                    length, offset, offset_status))
                if args.show_read_length_bfs:
                    # the title spans both columns (profile and Bayes factor
                    # images) and sits in its own row
                    cell_title = "\\scriptsize{" + cell_title + "}"
                    cell_title = "\\multicolumn{2}{c}{" + cell_title + "}"
                    latex.write(out, cell_title)
                    latex.write_row_sep(out)
                else:
                    latex.write(out, cell_title, size="scriptsize")

                latex.write_graphics(out, metagene_profile_image, width=0.45)

                i += 1
                if i % 2 == 1:
                    latex.write_column_sep(out)
                else:
                    latex.write_row_sep(out)

                if args.show_read_length_bfs:

                    bayes_factor_image = filenames.get_metagene_profile_bayes_factor_image(
                        config['riboseq_data'],
                        name,
                        image_type=args.image_type,
                        is_unique=is_unique,
                        length=length,
                        note=note)

                    latex.write_graphics(out, bayes_factor_image, width=0.45)

                    i += 1
                    if i % 2 == 1:
                        latex.write_column_sep(out)
                    else:
                        latex.write_row_sep(out)

            # an odd number of images leaves the last row half-filled
            if i % 2 == 1:
                latex.write_row_sep(out)

            latex.end_table(out)
            latex.clearpage(out)

        ### ORF type metagene profiles
        if args.show_orf_periodicity:
            title = "ORF type periodicity"
            latex.section(out, title)

            strands = ['+', '-']
            for sample_name in sample_names:
                # number of images written for this sample; figures hold up
                # to four images each
                i = 0

                try:
                    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                        config,
                        sample_name,
                        is_unique=is_unique,
                        default_params=metagene_options)
                except FileNotFoundError:
                    msg = (
                        "Could not parse out lengths and offsets for sample: {}. "
                        "Skipping".format(sample_name))
                    logger.error(msg)
                    continue

                orf_type_profile_base = filenames.get_orf_type_profile_base(
                    config['riboseq_data'],
                    sample_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=note,
                    subfolder='orf-profiles')

                for orf_type in ribo_utils.orf_types:
                    for strand in strands:
                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base,
                            orf_type,
                            strand,
                            image_type=args.image_type)

                        msg = "Looking for image file: {}".format(
                            orf_type_profile)
                        logger.debug(msg)
                        if os.path.exists(orf_type_profile):
                            if i % 4 == 0:
                                latex.begin_figure(out)

                            i += 1
                            latex.write_graphics(out,
                                                 orf_type_profile,
                                                 height=0.23)

                            if i % 4 == 0:
                                latex.end_figure(out)
                                latex.clearpage(out)

                # close a figure that was started but not filled
                if (i > 0) and (i % 4 != 0):
                    latex.end_figure(out)
                    latex.clearpage(out)

        latex.end_document(out)

    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)

    if args.create_fastqc_reports:
        parallel.apply_parallel_iter(config['riboseq_samples'].items(),
                                     args.num_cpus, create_fastqc_reports,
                                     config, args)
コード例 #4
0
def main():
    """Construct the profile of every ORF and write it as a sparse matrix.

    Steps, in order:

    1. Parse the command line; ``--lengths`` and ``--offsets`` must have the
       same number of values.
    2. Check that the input files exist.
    3. If ``args.use_slurm`` is set, hand the command line to
       ``slurm.check_sbatch`` and return.
    4. Obtain the P-sites from the bam file, read the exons and ORFs, and
       attach each ORF's ``orf_num`` to its exons.
    5. Split the exons into ``--num-groups`` groups, pair every group with the
       P-sites on the same seqnames and run ``get_all_p_site_intersections``
       on the pairs in parallel.
    6. Add up the per-group results, reverse the profiles of the
       reverse-strand ORFs and write the matrix to ``args.out``.

    Raises:
        ValueError: if the number of ``--lengths`` and ``--offsets`` differ.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description="""This script constructs the profile for each ORF. It
        first adjusts the mapped read positions to properly align with the P-sites. Second, it uses 
        a custom chrom-sweep algorithm to find the coverage of each position in each exon of each ORF. 
        Finally, the ORF exons are glued together to find the profile of the entire ORF.""")

    parser.add_argument('bam', help="The bam file including filtered (unique, etc.) alignments")

    parser.add_argument('orfs', help="The (bed12) file containing the ORFs")

    parser.add_argument('exons', help="The (bed6+2) file containing the exons")

    parser.add_argument('out', help="The (mtx.gz) output file containing the ORF profiles")

    parser.add_argument('-l', '--lengths', help="""If any values are given, then only reads which have
        those lengths will be included in the signal construction.""",
                        type=int, default=[], nargs='*')

    parser.add_argument('-o', '--offsets', help="""The 5' end of reads will be shifted by this amount.
        There must be one offset value for each length (given by the --lengths argument).""",
                        type=int, default=[], nargs='*')

    parser.add_argument('-k', '--num-exons', help="If k>0, then only the first k exons will be processed.",
                        type=int, default=0)

    parser.add_argument('-g', '--num-groups', help="""The number of groups into which to split the exons. 
        More groups means the progress bar is updated more frequently but incurs more overhead because of the
        parallel calls.""", type=int, default=default_num_groups)

    parser.add_argument('--seqname-prefix', help="""If present, this string will be prepended to the 
        seqname field of the ORFs.""", default='')

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[extract-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    # make sure the number of lengths and offsets match
    if len(args.lengths) != len(args.offsets):
        msg = "The number of --lengths and --offsets do not match."
        raise ValueError(msg)

    # make sure the necessary files exist
    required_files = [args.bam, args.orfs, args.exons]
    msg = "[extract-orf-profiles]: Some input files were missing: "
    utils.check_files_exist(required_files, msg=msg)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    msg = "Finding P-sites"
    logger.info(msg)

    p_sites = ribo_utils.get_p_sites(args.bam, args.lengths, args.offsets)

    msg = "Reading exons"
    logger.info(msg)
    exons = bed_utils.read_bed(args.exons)

    msg = "Reading ORFs"
    logger.info(msg)

    orfs = bed_utils.read_bed(args.orfs)

    if len(args.seqname_prefix) > 0:
        orfs['seqname'] = args.seqname_prefix + orfs['seqname']
        exons['seqname'] = args.seqname_prefix + exons['seqname']

    if args.num_exons > 0:
        exons = exons.head(args.num_exons)

    # dimensions handed to get_all_p_site_intersections
    num_orfs = orfs['orf_num'].max() + 1
    max_orf_len = orfs['orf_len'].max()

    msg = "Adding the ORF index to the exons"
    logger.info(msg)

    orf_fields = ['id', 'orf_num']
    exons_orfs = exons.merge(orfs[orf_fields], on='id')

    msg = "Splitting exons and P-sites"
    logger.info(msg)
    exon_groups = pandas_utils.split_df(exons_orfs, args.num_groups)

    exons_dfs = []
    psites_dfs = []

    for _, exon_group in exon_groups:
        # pull out only the p-sites that come from these chromosomes
        seqnames = set(exon_group['seqname'].unique())
        m_psites = p_sites['seqname'].isin(seqnames)

        exons_dfs.append(exon_group)
        psites_dfs.append(p_sites[m_psites])

    # we no longer need the full list of psites (or the unsplit exons), so
    # release them before the parallel step
    del p_sites
    del exons_orfs
    del exon_groups
    del exons
    gc.collect()
    exons_psites = zip(exons_dfs, psites_dfs)

    msg = "Finding all P-site intersections"
    logger.info(msg)

    # zip() has no length, so the total for the progress bar is passed
    # explicitly
    sum_profiles = parallel.apply_parallel_iter(
        exons_psites,
        args.num_cpus,
        get_all_p_site_intersections,
        num_orfs,
        max_orf_len,
        progress_bar=True,
        total=len(exons_dfs),
        backend='multiprocessing'
    )

    msg = "Combining the ORF profiles into one matrix"
    logger.info(msg)

    # element-wise sum of the per-group results
    sum_profiles = functools.reduce(lambda x, y: x + y, sum_profiles)
    # the lil copy receives the row-wise updates below; sum_profiles itself
    # is only read
    sum_profiles_lil = sum_profiles.tolil()

    msg = "Flipping the reverse strand profiles"
    logger.info(msg)

    m_reverse = orfs['strand'] == '-'
    reverse_orfs = orfs[m_reverse]

    # iterrows() has no length, so give tqdm the total explicitly
    for _, reverse_orf in tqdm.tqdm(reverse_orfs.iterrows(),
                                    total=len(reverse_orfs)):
        orf_num = reverse_orf['orf_num']

        # nothing to flip for an all-zero profile
        if sum_profiles[orf_num].sum() == 0:
            continue

        orf_len = reverse_orf['orf_len']
        dense = utils.to_dense(sum_profiles, orf_num, length=orf_len)
        dense = dense[::-1]
        sum_profiles_lil[orf_num, :orf_len] = dense

    msg = "Writing the sparse matrix to disk"
    logger.info(msg)
    math_utils.write_sparse_matrix(args.out, sum_profiles_lil)
コード例 #5
0
def main():
    """Extract the ORFs of all transcripts and write them as a BED12+ file.

    After parsing the command line (and returning early once the command has
    been handed to ``slurm.check_sbatch`` when ``args.use_slurm`` is set),
    the start and stop codons are compiled into two alternation regular
    expressions. Every fasta record is paired with the result of
    ``get_transcript`` and passed to ``get_orfs`` in parallel. The resulting
    frames are concatenated, ORFs that agree on ``DUPLICATE_FIELDS`` are
    collapsed to their first occurrence (the ids of all of them are kept,
    comma-joined, in the ``duplicates`` column), the remaining ORFs are
    numbered consecutively in ``orf_num`` and the table is written to
    ``args.out``.
    """
    cli = argparse.ArgumentParser(
        description='''Extract the ORFs from the given transcripts and
        write as a BED12+ file. Additional fields, 'orf_len' and 'orf_num', give the length 
        of each ORF and it's index (used to write the ORF profiles). A third additional field
        records duplicated ORFs from transcript variants.''',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # positional arguments
    cli.add_argument('transcripts_bed', help='''The BED12 file containing the 
        transcript information.''')
    cli.add_argument('transcripts_fasta', help='''The fasta file containing the 
        spliced transcript sequences.''')
    cli.add_argument('out', help='''The output (BED12+ gz) file.''')

    # codon options
    cli.add_argument('--start-codons',
                     nargs='+',
                     default=default_start_codons,
                     help='''A list of codons which will be treated 
        as start codons when extracting the ORFs.''')
    cli.add_argument('--stop-codons',
                     nargs='+',
                     default=default_stop_codons,
                     help='''A list of codons which will be treated 
        as stop codons when extracting the ORFs.''')

    slurm.add_sbatch_options(cli)
    logging_utils.add_logging_options(cli)
    args = cli.parse_args()
    logging_utils.update_logging(args)

    # check if we wanted to use slurm
    if args.use_slurm:
        slurm.check_sbatch(' '.join(sys.argv), args=args)
        return

    logger.info("Compiling start and stop codon regular expressions")
    # each pattern is simply the alternation of the given codons
    start_codons_re = re.compile('|'.join(args.start_codons))
    stop_codons_re = re.compile('|'.join(args.stop_codons))

    logger.info("Reading transcripts bed file")
    transcripts_bed = bed_utils.read_bed(args.transcripts_bed)

    logger.info("Creating the sequence iterator")
    fasta_records = fastx_utils.get_read_iterator(args.transcripts_fasta)

    # lazily pair each sequence with the transcript looked up from its header
    transcripts_iter = (
        (get_transcript(header, transcripts_bed), sequence)
        for header, sequence in fasta_records)

    logger.info("Finding all ORFs")
    orf_frames = parallel.apply_parallel_iter(transcripts_iter,
                                              args.num_cpus,
                                              get_orfs,
                                              start_codons_re,
                                              stop_codons_re,
                                              total=len(transcripts_bed),
                                              progress_bar=True)

    logger.info("Joining ORFs in a large data frame")
    orfs = pd.concat(orf_frames).reset_index(drop=True)

    #  Which of the duplicated ORFs survives is arbitrary (the first one),
    #  however we keep all matching transcripts for reference
    logger.info("Marking and removing duplicate ORFs")

    # one row per distinct ORF holding the comma-joined ids of all its copies
    joined_ids = orfs.groupby(DUPLICATE_FIELDS,
                              as_index=False).agg({'id': ','.join})
    # both frames have an 'id' column, so the merge yields 'id_x' and 'id_y'
    orfs = orfs.merge(joined_ids, on=DUPLICATE_FIELDS, how='left')
    orfs.drop_duplicates(subset=DUPLICATE_FIELDS, keep='first', inplace=True)
    orfs.rename(columns={'id_x': 'id', 'id_y': 'duplicates'}, inplace=True)

    logger.info("Numbering remaining ORFs")
    orfs['orf_num'] = np.arange(len(orfs))

    logger.info("Writing ORFs to disk")
    bed_utils.write_bed(orfs, args.out)