def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script visualizes the metagene profiles for each ORF type "
        "present in a given BED12+ file. It visualizes the mean and variance of normalized "
        "profiles in the first 21-bp, last 21-bp, and across all other 21-bp windows."
    )

    parser.add_argument('orfs', help="The BED12+ file containing the ORFs")
    parser.add_argument('profiles',
                        help="The (mtx) file containing the ORF profiles")
    parser.add_argument(
        'out',
        help="The base output name. The output filenames will be of "
        "the form: <out>.<orf-type>.<image-type>.")

    parser.add_argument('--min-profile',
                        help="The minimum value of the sum over the profile "
                        "to include it in the analysis",
                        type=float,
                        default=default_min_profile)

    parser.add_argument(
        '--max-orfs',
        help="At most this many ORFs of each type will be "
        "used to create the figures. They will be sampled randomly from among those "
        "which meet the min-profile constraint.",
        type=int,
        default=default_max_orfs)

    parser.add_argument('--title',
                        help="The prefix to use for the title of the plots",
                        default=default_title)

    parser.add_argument('--image-type',
                        help="The type of image files to create. The type "
                        "must be recognized by matplotlib.",
                        default=default_image_type)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading ORFs"
    logger.info(msg)
    orfs = bed_utils.read_bed(args.orfs)

    msg = "Reading profiles"
    logger.info(msg)
    profiles = scipy.io.mmread(args.profiles).tocsr()

    msg = "Extracting the metagene profiles and creating the images"
    logger.info(msg)

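    # GroupBy.apply forwards the extra positional arguments, so each ORF-type
    # group is passed to extract_profiles_and_plot together with the shared
    # profiles matrix and the parsed arguments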
    orf_type_groups = orfs.groupby('orf_type')
    orf_type_groups.apply(extract_profiles_and_plot, profiles, args)

    msg = "Finished"
    logger.info(msg)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script collects counts of riboseq reads filtered at each step in "
        "the micropeptide prediction pipeline. It mostly parses fastqc results (using the "
        "crimson python package).")
    parser.add_argument('config', help="The yaml config file")
    parser.add_argument('out', help="The output csv file with the counts")
    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of processors to use",
                        type=int,
                        default=default_num_cpus)
    parser.add_argument('--overwrite', action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    programs = ['samtools']
    shell_utils.check_programs_exist(programs)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

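    # apply get_counts to each (name, value) pair from 'riboseq_samples' in the
    # config; samples for which get_counts returns None are dropped below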
    res = parallel.apply_parallel_iter(config['riboseq_samples'].items(),
                                       args.num_cpus, get_counts, config, args)
    res = [r for r in res if r is not None]
    res_df = pd.DataFrame(res)

    pandas_utils.write_df(res_df, args.out, index=False)
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""Convert featureCount output to BED12 with exon-union coordinates at
        meta-feature level.""")

    parser.add_argument('tsv', help="The featureCounts tsv file")
    parser.add_argument('out', help="The (output) BED12 file, compressed by default")

    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use",
        type=int, default=12)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading featureCount tsv file"
    logger.info(msg)
    
    tsv = pd.read_csv(args.tsv, 
                      usecols=['Geneid', 'Chr', 'Start', 'End', 'Strand', 'Length'], 
                      sep='\t', 
                      comment='#')
    
    msg = "Merging..."
    logger.info(msg)
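    # merge_gene_group presumably collapses the per-exon Chr/Start/End/Strand
    # entries of each gene (meta-feature) into exon-union BED12 blocks;
    # apply_parallel distributes that work across the requested number of CPUs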
    merged = parallel.apply_parallel(tsv, args.num_cpus, merge_gene_group)  
    merged = pd.DataFrame(merged)
    
    msg = "Sorting..."
    logger.info(msg)
    # We will break ties among transcripts by the order they appear
    # in the GTF file. This is the same way STAR breaks ties.
    merged = bed_utils.sort(merged)

    msg = "Writing BED12 to disk"
    logger.info(msg)
    
    # copy the field list rather than appending to the shared module-level list
    fields = bed_utils.bed12_field_names + ['length']
    bed_utils.write_bed(merged[fields], args.out)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""This script adds the ORF profiles from a set
        of profiles (presumably, each file corresponds to one replicate from a condition). 
        The script keeps the profiles in sparse matrix format, so it is fairly efficient."""
    )

    parser.add_argument('profiles',
                        help="The (mtx) files containing the ORF profiles",
                        nargs='+')

    parser.add_argument(
        'out', help="The (mtx.gz) output file containing the merged profiles")

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading first ORF profile"
    logger.info(msg)

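    # element-wise addition of CSR matrices requires all of the profile files to
    # share the same dimensions (ORFs x positions), which is presumably
    # guaranteed by the upstream profile-construction step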
    merged_profiles = scipy.io.mmread(args.profiles[0]).tocsr()

    msg = "Adding each additional profile"
    logger.info(msg)

    for profile_file in args.profiles[1:]:
        msg = "Reading file: {}".format(profile_file)
        logger.info(msg)

        profiles = scipy.io.mmread(profile_file).tocsr()
        merged_profiles = merged_profiles + profiles

    msg = "Writing merged profiles to disk"
    logger.info(msg)

    math_utils.write_sparse_matrix(args.out, merged_profiles)
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description="""This script constructs the profile for each ORF. It
        first adjusts the mapped read positions to properly align with the P-sites. Second, it uses 
        a custom chrom-sweep algorithm to find the coverage of each position in each exon of each ORF. 
        Finally, the ORF exons are glued together to find the profile of the entire ORF.""")
    
    parser.add_argument('bam', help="The bam file including filtered (unique, etc.) alignments")

    parser.add_argument('orfs', help="The (bed12) file containing the ORFs")

    parser.add_argument('exons', help="The (bed6+2) file containing the exons")

    parser.add_argument('out', help="The (mtx.gz) output file containing the ORF profiles")

    parser.add_argument('-l', '--lengths', help="""If any values are given, then only reads which have
        those lengths will be included in the signal construction.""",
                        type=int, default=[], nargs='*')

    parser.add_argument('-o', '--offsets', help="""The 5' end of reads will be shifted by this amount.
        There must be one offset value for each length (given by the --lengths argument).""",
                        type=int, default=[], nargs='*')
       
    parser.add_argument('-k', '--num-exons', help="If k>0, then only the first k exons will be processed.",
                        type=int, default=0)

    parser.add_argument('-g', '--num-groups', help="""The number of groups into which to split the exons.
        More groups means the progress bar is updated more frequently but incurs more overhead because of the
        parallel calls.""", type=int, default=default_num_groups)

    parser.add_argument('--seqname-prefix', help="""If present, this string will be prepended to the 
        seqname field of the ORFs.""", default='')
        
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[extract-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)
    
    # make sure the number of lengths and offsets match
    if len(args.lengths) != len(args.offsets):
        msg = "The number of --lengths and --offsets do not match."
        raise ValueError(msg)

    # make sure the necessary files exist
    required_files = [args.bam, args.orfs, args.exons]
    msg = "[extract-orf-profiles]: Some input files were missing: "
    utils.check_files_exist(required_files, msg=msg)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    msg = "Finding P-sites"
    logger.info(msg)

    p_sites = ribo_utils.get_p_sites(args.bam, args.lengths, args.offsets)

    msg = "Reading exons"
    logger.info(msg)
    exons = bed_utils.read_bed(args.exons)

    msg = "Reading ORFs"
    logger.info(msg)

    orfs = bed_utils.read_bed(args.orfs)

    if len(args.seqname_prefix) > 0:
        orfs['seqname'] = args.seqname_prefix + orfs['seqname']
        exons['seqname'] = args.seqname_prefix + exons['seqname']

    if args.num_exons > 0:
        exons = exons.head(args.num_exons)

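    # the profile matrix has one row per ORF (indexed by orf_num) and one column
    # per nucleotide position, so its dimensions are set by the largest orf_num
    # and the longest ORF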
    num_orfs = orfs['orf_num'].max() + 1
    max_orf_len = orfs['orf_len'].max()

    msg = "Adding the ORF index to the exons"
    logger.info(msg)

    orf_fields = ['id', 'orf_num']
    exons_orfs = exons.merge(orfs[orf_fields], on='id')

    msg = "Splitting exons and P-sites"
    logger.info(msg)
    exon_groups = pandas_utils.split_df(exons_orfs, args.num_groups)

    exons_dfs = []
    psites_dfs = []

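    # pair each exon group with only the P-sites on its chromosomes; this keeps
    # the data passed to each parallel task (below) as small as possible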
    for group_index, exon_group in exon_groups:
        # pull out only the p-sites that come from these chromosomes
        seqnames = set(exon_group['seqname'].unique())
        m_psites = p_sites['seqname'].isin(seqnames)
        
        exons_dfs.append(exon_group)
        psites_dfs.append(p_sites[m_psites])

    # we no longer need the full list of psites
    del p_sites
    del exons_orfs
    del exon_groups
    del exons
    gc.collect()
    exons_psites = zip(exons_dfs, psites_dfs)
     
    msg = "Finding all P-site intersections"
    logger.info(msg)

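    # each task presumably returns a sparse (num_orfs x max_orf_len) matrix of
    # P-site counts for its exon group; the per-group matrices are summed below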
    sum_profiles = parallel.apply_parallel_iter(
        exons_psites,
        args.num_cpus,
        get_all_p_site_intersections,
        num_orfs,
        max_orf_len,
        progress_bar=True,
        total=len(exons_dfs),
        backend='multiprocessing'
    )

    msg = "Combining the ORF profiles into one matrix"
    logger.info(msg)
        
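    # sum the per-group count matrices into a single profile matrix; the LIL
    # copy is made because LIL supports the efficient row-slice assignment used
    # when flipping the reverse-strand profiles below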
    f = lambda x, y: x+y

    sum_profiles = functools.reduce(f, sum_profiles)
    sum_profiles_lil = sum_profiles.tolil()

    msg = "Flipping the reverse strand profiles"
    logger.info(msg)

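    # the P-site counts are presumably accumulated in forward (genomic)
    # coordinates, so profiles of ORFs on the reverse strand are reversed here
    # to run 5' to 3'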
    m_reverse = orfs['strand'] == '-'
    reverse_orfs = orfs[m_reverse]

    for idx, reverse_orf in tqdm.tqdm(reverse_orfs.iterrows()):
        orf_num = reverse_orf['orf_num']

        if sum_profiles[orf_num].sum() == 0:
            continue

        orf_len = reverse_orf['orf_len']
        dense = utils.to_dense(sum_profiles, orf_num, length=orf_len)
        dense = dense[::-1]
        sum_profiles_lil[orf_num, :orf_len] = dense

    msg = "Writing the sparse matrix to disk"
    logger.info(msg)
    math_utils.write_sparse_matrix(args.out, sum_profiles_lil)
def main():
    global profiles_data, profiles_indices, profiles_indptr, profiles_shape
    global translated_models, untranslated_models
    global args

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""This script uses Hamiltonian MCMC with Stan 
        to estimate translation parameters for a set of regions (presumably ORFs). Roughly, it takes 
        as input: (1) a set of regions (ORFs) and their corresponding profiles
                  (2) a "translated" model which gives the probability that a region is translated
                  (3) an "untranslated" model which gives the probability that a region is not translated.
        The script first smooths the profiles using LOWESS. It then calculates both the Bayes' factor
        (using the smoothed profile) and the chi-square value (using the raw counts) for each ORF."""
    )

    parser.add_argument('profiles', help="The ORF profiles (counts) (mtx)")

    parser.add_argument(
        'regions',
        help="The regions (ORFs) for which predictions will be made (BED12+)")

    parser.add_argument('out',
                        help="The output file for the Bayes' factors (BED12+)")

    parser.add_argument('--chi-square-only',
                        help="""If this flag is present, then only the chi
        square test will be performed for each ORF. This can also be a way to get the counts within 
        each of the ORFs.""",
                        action='store_true')

    parser.add_argument('--translated-models',
                        help="The models to use as H_t (pkl)",
                        nargs='+')

    parser.add_argument('--untranslated-models',
                        help="The models to use as H_u (pkl)",
                        nargs='+')

    # filtering options
    parser.add_argument(
        '--orf-types',
        help=
        "If values are given, then only orfs with those types are processed.",
        nargs='*',
        default=translation_options['orf_types'])

    parser.add_argument('--orf-type-field', default=default_orf_type_field)

    parser.add_argument(
        '--min-length',
        help="ORFs with length less than this value will not be processed",
        type=int,
        default=translation_options['orf_min_length_pre'])

    parser.add_argument(
        '--max-length',
        help="ORFs with length greater than this value will not be processed",
        type=int,
        default=translation_options['orf_max_length_pre'])

    parser.add_argument(
        '--min-profile',
        help="""ORFs with profile sum (i.e., number of reads) less than this
        value will not be processed.""",
        type=float,
        default=translation_options['orf_min_profile_count_pre'])

    # smoothing options
    parser.add_argument('--fraction',
                        help="The fraction of signal to use in LOWESS",
                        type=float,
                        default=translation_options['smoothing_fraction'])

    parser.add_argument(
        '--reweighting-iterations',
        help="The number of reweighting "
        "iterations to use in LOWESS. "
        "Please see the statsmodels documentation for a "
        "detailed description of this parameter.",
        type=int,
        default=translation_options['smoothing_reweighting_iterations'])

    # MCMC options
    parser.add_argument('-s',
                        '--seed',
                        help="The random seeds to use for inference",
                        type=int,
                        default=translation_options['seed'])
    parser.add_argument('-c',
                        '--chains',
                        help="The number of MCMC chains to use",
                        type=int,
                        default=translation_options['chains'])
    parser.add_argument(
        '-i',
        '--iterations',
        help="The number of MCMC iterations to use for each chain",
        type=int,
        default=translation_options['translation_iterations'])

    # behavior options
    parser.add_argument(
        '--num-orfs',
        help="If n>0, then only this many ORFs will be processed",
        type=int,
        default=0)

    parser.add_argument('--orf-num-field', default=default_orf_num_field)

    parser.add_argument('--do-not-compress',
                        help="Unless otherwise specified, the output will "
                        "be written in GZip format",
                        action='store_true')

    parser.add_argument('-g',
                        '--num-groups',
                        help="The number of groups into which to split "
                        "the ORFs. More groups means the progress bar is "
                        "updated more frequently but incurs more overhead "
                        "because of the parallel calls.",
                        type=int,
                        default=default_num_groups)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # read in the regions and apply the filters
    msg = "Reading and filtering ORFs"
    logger.info(msg)
    regions = bed_utils.read_bed(args.regions)

    # by default, keep everything
    m_filters = np.array([True] * len(regions))

    if len(args.orf_types) > 0:
        m_orf_type = regions[args.orf_type_field].isin(args.orf_types)
        m_filters = m_orf_type & m_filters

    # min length
    if args.min_length > 0:
        m_min_length = regions['orf_len'] >= args.min_length
        m_filters = m_min_length & m_filters

    # max length
    if args.max_length > 0:
        m_max_length = regions['orf_len'] <= args.max_length
        m_filters = m_max_length & m_filters

    # min profile
    profiles = scipy.io.mmread(args.profiles).tocsr()
    profiles_sums = profiles.sum(axis=1)
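    # rows of the profile matrix are indexed by orf_num, so the rows whose sums
    # meet the threshold can be matched against the 'orf_num' field of the regions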
    good_orf_nums = np.where(profiles_sums >= args.min_profile)
    good_orf_nums = set(good_orf_nums[0])
    m_profile = regions['orf_num'].isin(good_orf_nums)
    m_filters = m_profile & m_filters

    regions = regions[m_filters]

    if args.num_orfs > 0:
        regions = regions.head(args.num_orfs)

    regions = regions.reset_index(drop=True)

    msg = "Number of regions after filtering: {}".format(len(regions))
    logger.info(msg)

    logger.debug("Reading models")
    translated_models = [
        pickle.load(open(tm, 'rb')) for tm in args.translated_models
    ]
    untranslated_models = [
        pickle.load(open(bm, 'rb')) for bm in args.untranslated_models
    ]

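    # copy the CSR components of the profile matrix into shared memory; the
    # module-level globals declared above presumably allow the multiprocessing
    # workers to rebuild the sparse matrix without pickling it for every task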
    profiles_data = multiprocessing.RawArray(ctypes.c_double,
                                             profiles.data.flat)
    profiles_indices = multiprocessing.RawArray(ctypes.c_int, profiles.indices)
    profiles_indptr = multiprocessing.RawArray(ctypes.c_int, profiles.indptr)
    profiles_shape = multiprocessing.RawArray(ctypes.c_int, profiles.shape)

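    # silence the (presumably verbose) output written by the compiled Stan
    # models during sampling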
    with suppress_stdout_stderr():

        bfs_l = parallel.apply_parallel_split(regions,
                                              args.num_cpus,
                                              get_all_bayes_factors_args,
                                              num_groups=args.num_groups,
                                              progress_bar=True,
                                              backend='multiprocessing')

    bfs = pd.concat(bfs_l)

    # write the results as a bed12+ file
    bed_utils.write_bed(bfs, args.out)
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Collect the individual read length ORF profiles (mtx) created "
        "by 'create-read-length-orf-profiles' into a single 'sparse tensor'. "
        "N.B. This script is called by 'create-read-length-orf-profiles'; however, "
        "we still call each sample independently for condition, lengths and offsets.")
    
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name', help="The name of either one of the 'riboseq_samples' "
        "or 'riboseq_biological_replicates' from the config file.")
    
    parser.add_argument('out', help="The output (txt.gz) file. N.B. The output uses "
        "base-0 indexing, contrary to the unsmoothed ORF profiles, which are written "
        "using the matrix market format (base-1 indexing).")

    parser.add_argument('-c', '--is-condition', help="If this flag is present, "
        "then 'name' will be taken to be a condition name. The profiles for "
        "all relevant replicates of the condition will be created.", 
        action='store_true')

    parser.add_argument('--add-ids', help="If this flag is present, "
        "then orf_ids will be added to the final output.", action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)
    
    msg = "Reading config file"
    logger.info(msg)
    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # pull out what we need from the config file
    is_unique = not ('keep_riboseq_multimappers' in config)
    note = config.get('note', None)

    if args.add_ids:
        orf_note = config.get('orf_note', None)
        orfs_file = filenames.get_orfs(
            config['genome_base_path'],
            config['genome_name'],
            note=orf_note
        )
        orfs = bed_utils.read_bed(orfs_file)

    names = [args.name]
    if args.is_condition:
        riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
        names = [n for n in riboseq_replicates[args.name]]

    # keep a map from the lengths to the combined profiles
    length_profile_map = {}

    for name in names:

        msg = "Processing sample: {}".format(name)
        logger.info(msg)
        
        # get the lengths and offsets which meet the required criteria from the config file
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config, 
            name, 
            is_unique=is_unique,
            default_params=metagene_options
        )

        if len(lengths) == 0:
            msg = ("No periodic read lengths and offsets were found. Try relaxing "
                "min_metagene_profile_count, min_metagene_bf_mean, "
                "max_metagene_bf_var, and/or min_metagene_bf_likelihood. Qutting.")
            logger.critical(msg)
            return

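        # accumulate one combined profile matrix per periodic read length,
        # summing over all relevant replicates; together these form the
        # 'sparse tensor' (ORF x position x read length) described above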
        for length, offset in zip(lengths, offsets):
                        
            mtx = filenames.get_riboseq_profiles(
                config['riboseq_data'], 
                name, 
                length=[length], 
                offset=[offset], 
                is_unique=is_unique, 
                note=note
            )

            mtx = scipy.io.mmread(mtx).tocsr()

            prior_mtx = length_profile_map.get(length, None)

            if prior_mtx is None:
                length_profile_map[length] = mtx
            else:
                length_profile_map[length] = prior_mtx + mtx

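    # each non-zero entry becomes one line of the gzipped text output; COO
    # format exposes the entries directly as parallel row/col/data arrays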
    if args.add_ids:
        with gzip.open(args.out, 'wb') as target_gz:

            for length, mtx in length_profile_map.items():
                mtx = mtx.tocoo()

                msg = "Writing ORF profiles. length: {}.".format(length)
                logger.info(msg)

                for row, col, val in zip(mtx.row, mtx.col, mtx.data):
                    # row (the orf_num) and col are both zero-based, since we are now using coo
                    orf_id = orfs.loc[orfs['orf_num'] == row]['id'].values[0]
                    s = "{} {} {} {} {}\n".format(row, orf_id, col, length, val)
                    target_gz.write(s.encode())
    else:
        with gzip.open(args.out, 'wb') as target_gz:

            for length, mtx in length_profile_map.items():
                mtx = mtx.tocoo()

                msg = "Writing ORF profiles. length: {}.".format(length)
                logger.info(msg)

                for row, col, val in zip(mtx.row, mtx.col, mtx.data):
                    s = "{} {} {} {}\n".format(row, col, length, val)
                    target_gz.write(s.encode())
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description="Creates base genome profile.")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")

    parser.add_argument('config', help="The (yaml) configuration file")

    parser.add_argument('name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
                        type=int, default=default_num_cpus)
    
    parser.add_argument('--mem', help="The amount of RAM to request", default=default_mem)

    parser.add_argument('-t', '--tmp', help="""The location for temporary files. If not
        specified, program-specific temp locations are used.""", default=None)

    parser.add_argument('--do-not-call', action='store_true')

    parser.add_argument('--overwrite', help="""If this flag is present, existing files
        will be overwritten.""", action='store_true')

    parser.add_argument('-k', '--keep-intermediate-files', help="""If this flag is given,
        then all intermediate files will be kept; otherwise, they will be
        deleted. This feature is implemented piecemeal. If the --do-not-call flag
        is given, then nothing will be deleted.""", action='store_true')

    logging_utils.add_logging_options(parser)
    pgrm_utils.add_star_options(parser, star_executable)
    pgrm_utils.add_flexbar_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[create-base-genome-profile]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'remove-multimapping-reads'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'ribosomal_index',
        'gtf',
        'genome_base_path',
        'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)
    call = not args.do_not_call
    keep_delete_files = args.keep_intermediate_files or args.do_not_call

    # Step 0: Running flexbar to remove adapter sequences

    raw_data = args.raw_data
    flexbar_target = filenames.get_without_adapters_base(config['riboseq_data'],
                                                         args.name,
                                                         note=note)
    without_adapters = filenames.get_without_adapters_fastq(config['riboseq_data'],
                                                            args.name,
                                                            note=note)

    adapter_seq_str = utils.get_config_argument(config, 'adapter_sequence', 'adapter-seq')
    adapter_file_str = utils.get_config_argument(config, 'adapter_file', 'adapters')

    # get all options, command line options override defaults
    flexbar_option_str = pgrm_utils.get_final_args(flexbar_options, args.flexbar_options)

    cmd = "flexbar -r {} -t {} {} {} {} -n {}".format(raw_data,
                                                      flexbar_target,
                                                      adapter_seq_str,
                                                      adapter_file_str,
                                                      flexbar_option_str,
                                                      args.num_cpus)
    in_files = [raw_data]
    out_files = [without_adapters]
    file_checkers = {
        without_adapters: fastx_utils.check_fastq_file
    }
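    # call_if_not_exists presumably runs the command only when the output files
    # are missing (or --overwrite is given) and validates them with the
    # registered file checkers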
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers, overwrite=args.overwrite, call=call)

    # Step 1: Running bowtie2 to remove rRNA alignments

    out = utils.abspath("dev", "null")  # we do not care about the alignments
    without_rrna = filenames.get_without_rrna_fastq(config['riboseq_data'],
                                                    args.name,
                                                    note=note)
    with_rrna = filenames.get_with_rrna_fastq(config['riboseq_data'],
                                              args.name,
                                              note=note)

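    # --un-gz writes the reads that do not align to the ribosomal index
    # (gzipped) to 'without_rrna', and --al-gz writes the aligning reads to
    # 'with_rrna'; the SAM alignments themselves are discarded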
    cmd = "bowtie2 -p {} --very-fast -x {} -U {} -S {} --un-gz {} --al-gz {}".format(
        args.num_cpus,
        config['ribosomal_index'],
        without_adapters,
        out,
        without_rrna,
        with_rrna)

    in_files = [without_adapters]
    in_files.extend(pgrm_utils.get_bowtie2_index_files(config['ribosomal_index']))
    out_files = [without_rrna, with_rrna]
    to_delete = [without_adapters]
    file_checkers = {
        without_rrna: fastx_utils.check_fastq_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers, overwrite=args.overwrite, call=call,
                                   keep_delete_files=keep_delete_files, to_delete=to_delete)

    # Step 2: Running STAR to align rRNA-depleted reads to genome

    star_output_prefix = filenames.get_riboseq_bam_base(config['riboseq_data'],
                                                        args.name,
                                                        note=note)
    genome_star_bam = "{}{}".format(star_output_prefix, "Aligned.sortedByCoord.out.bam")

    # get all options, command line options override defaults

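    # cap STAR's BAM sorting buffer at the requested amount of memory;
    # limitBAMsortRAM expects a value in bytes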
    mem_bytes = utils.human2bytes(args.mem)
    star_options['limitBAMsortRAM'] = mem_bytes

    if args.tmp is not None:
        star_tmp_name = str(args.name + "_STARtmp")
        star_tmp_dir = pgrm_utils.create_star_tmp(args.tmp, star_tmp_name)
        star_options['outTmpDir'] = star_tmp_dir

    star_option_str = pgrm_utils.get_final_args(star_options, args.star_options)

    # If the annotation is in GFF3 format, we need to inform STAR.
    # Whether or not we have a de novo assembly, the format of config['gtf'] takes precedence.
    sjdb_gtf_tag_str = ""
    use_gff3_specs = config['gtf'].endswith('gff')
    gtf_file = filenames.get_gtf(config['genome_base_path'],
                                 config['genome_name'],
                                 is_gff3=use_gff3_specs,
                                 is_star_input=True)
    if use_gff3_specs:
        sjdb_gtf_tag_str = "--sjdbGTFtagExonParentTranscript Parent"

    cmd = ("{} --runThreadN {} --genomeDir {} --sjdbGTFfile {} {} --readFilesIn {} "
        "{} --outFileNamePrefix {}".format(args.star_executable,
                                                 args.num_cpus,
                                                 config['star_index'],
                                                 gtf_file,
                                                 sjdb_gtf_tag_str,
                                                 without_rrna,
                                                 star_option_str,
                                                 star_output_prefix))
    in_files = [without_rrna]
    in_files.extend(pgrm_utils.get_star_index_files(config['star_index']))
    to_delete = [without_rrna]
    out_files = [genome_star_bam]
    file_checkers = {
        genome_star_bam: bam_utils.check_bam_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers, overwrite=args.overwrite,
                                   call=call, keep_delete_files=keep_delete_files, to_delete=to_delete)
    
    # now, we need to symlink the (genome) STAR output to that expected by the rest of the pipeline
    genome_sorted_bam = filenames.get_riboseq_bam(config['riboseq_data'],
                                                  args.name,
                                                  note=note)

    if os.path.exists(genome_star_bam):
        shell_utils.create_symlink(genome_star_bam, genome_sorted_bam, call)
    else:
        msg = ("Could not find the STAR genome bam alignment file. Unless "
               "--do-not-call was given, this is a problem.")
        logger.warning(msg)

    # create the BAM index with samtools
    cmd = "samtools index -b {}".format(genome_sorted_bam)
    shell_utils.check_call(cmd, call=call)

    # check if we want to keep multimappers
    if 'keep_riboseq_multimappers' in config:
        return

    # remove multimapping reads from the genome file
    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    unique_genome_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                       args.name,
                                                       is_unique=True,
                                                       note=note)

    cmd = "remove-multimapping-reads {} {} {}".format(genome_sorted_bam, 
                                                      unique_genome_filename,
                                                      tmp_str)

    in_files = [genome_sorted_bam]
    out_files = [unique_genome_filename]
    to_delete = [genome_star_bam, genome_sorted_bam]
    file_checkers = {
        unique_genome_filename: bam_utils.check_bam_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers, overwrite=args.overwrite,
                                   call=call, keep_delete_files=keep_delete_files, to_delete=to_delete)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='''Label the ORFs based on their transcript
        exon structure wrt the annotated transcripts.''')

    parser.add_argument('annotated_transcripts',
                        help='''The annotated transcripts for the genome
        in BED12+ format.''')

    parser.add_argument('extracted_orfs',
                        help='''The ORFs extracted from the transcripts 
        in BED12+ format.''')

    parser.add_argument('out', help='''The output (BED12+.gz) file.''')

    parser.add_argument('-e',
                        '--annotated-exons',
                        help='''The annotated transcript 
        exons can be passed with this option. If they are not given, they will be 
        split from the annotated transcripts.''',
                        default=None)

    parser.add_argument('-o',
                        '--orf-exons',
                        help='''The exon blocks for the ORFs, in BED6+ format, 
        obtained from "split-bed12-blocks". If they are not given, they will be split from the
        extracted ORFs.''',
                        default=None)

    parser.add_argument('-n',
                        '--nonoverlapping-label',
                        help='''If this option is given, 
        then the ORFs which do not overlap the annotated transcripts at all will be given this label.
        By default, remaining out-of-frame overlapping ORFs are assigned the "overlap" label.
        If not given, the ORFs outside of annotated regions are labeled as "suspect".''',
                        default=None)

    parser.add_argument('-l',
                        '--label-prefix',
                        help='''This string is prepended to all labels 
        assigned to ORFs, e.g. to indicate ORFs from a de novo assembly (Rp-Bp assigns the label
        "novel" to these, however the string is not prepended to "canonical ORFs").''',
                        default='')

    parser.add_argument('-f',
                        '--filter',
                        help='''If this flag is given, then ORFs
        which are completely covered by an annotated transcript are discarded. Use to filter 
        uninteresting ORFs from a de novo assembly.''',
                        action='store_true')

    parser.add_argument('-p',
                        '--num-cpus',
                        help='''The number of CPUs to use to perform
            BED operations.''',
                        type=int,
                        default=default_num_cpus)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading annotated transcripts"
    logger.info(msg)
    annotated_transcripts = bed_utils.read_bed(args.annotated_transcripts)

    # get the annotated transcript exons
    if args.annotated_exons is None:
        msg = "Splitting the annotated transcripts into exon blocks"
        logger.info(msg)

        annotated_exons = bed_utils.split_bed12(annotated_transcripts,
                                                num_cpus=args.num_cpus,
                                                progress_bar=True)
    else:
        msg = "Reading the annotated transcript exons"
        logger.info(msg)

        annotated_exons = bed_utils.read_bed(args.annotated_exons)

    msg = "Reading extracted ORFs"
    logger.info(msg)
    extracted_orfs = bed_utils.read_bed(args.extracted_orfs)

    if args.orf_exons is None:
        msg = "Splitting the extracted ORFs into exon blocks"
        logger.info(msg)
        extracted_orf_exons = bed_utils.split_bed12(extracted_orfs,
                                                    num_cpus=args.num_cpus,
                                                    progress_bar=True)
    else:
        msg = "Reading the extracted ORFs exons"
        logger.info(msg)
        extracted_orf_exons = bed_utils.read_bed(args.orf_exons)

    msg = "Found {} extracted ORFs with {} exons".format(
        len(extracted_orfs), len(extracted_orf_exons))
    logger.debug(msg)

    # filter out the ORFs that are entirely within annotated transcripts
    if args.filter:
        msg = "Removing ORFs which are completely covered by the annotated transcripts"
        logger.info(msg)

        nonoverlapping_ids = bed_utils.subtract_bed(extracted_orf_exons,
                                                    annotated_exons,
                                                    min_a_overlap=1)
        m_unfiltered = extracted_orfs['id'].isin(nonoverlapping_ids)
        extracted_orfs = extracted_orfs[m_unfiltered]
        # discard the unnecessary exons
        m_unfiltered = extracted_orf_exons['id'].isin(nonoverlapping_ids)
        extracted_orf_exons = extracted_orf_exons[m_unfiltered]

        msg = "After filtering, {} extracted ORFs remain".format(
            len(extracted_orfs))
        logger.info(msg)

    # annotate and remove the ORFs which do not at all overlap the annotations
    if args.nonoverlapping_label is not None:
        nonoverlapping_ids = bed_utils.subtract_bed(
            extracted_orfs,
            annotated_transcripts,
            exons_a=extracted_orf_exons,
            exons_b=annotated_exons)
        m_nonoverlapping = extracted_orf_exons['id'].isin(nonoverlapping_ids)
        extracted_orf_exons = extracted_orf_exons[~m_nonoverlapping]
        m_nonoverlapping = extracted_orfs['id'].isin(nonoverlapping_ids)
        extracted_orfs.loc[m_nonoverlapping,
                           'orf_type'] = args.nonoverlapping_label

        msg = ("Found {} ORFs completely non-overlapping annotated transcripts"
               .format(len(nonoverlapping_ids)))
        logger.info(msg)

    msg = "Removing the annotated UTRs from the transcripts"
    logger.info(msg)
    canonical_orfs = bed_utils.retain_all_thick_only(annotated_transcripts,
                                                     num_cpus=args.num_cpus)

    msg = "Splitting the canonical ORFs into exons"
    logger.info(msg)
    canonical_orf_exons = bed_utils.split_bed12(canonical_orfs,
                                                num_cpus=args.num_cpus,
                                                progress_bar=True)

    msg = "Extracting annotated 5' leader regions"
    logger.info(msg)
    five_prime_regions = bed_utils.retain_all_five_prime_of_thick(
        annotated_transcripts, num_cpus=args.num_cpus)

    if len(five_prime_regions) == 0:
        msg = "No annotated 5' leader regions were found"
        logger.warning(msg)

    msg = "Splitting the 5' leaders into exons"
    logger.info(msg)
    five_prime_exons = bed_utils.split_bed12(five_prime_regions,
                                             num_cpus=args.num_cpus,
                                             progress_bar=True)

    msg = "Extracting annotated 3' trailer regions"
    logger.info(msg)
    three_prime_regions = bed_utils.retain_all_three_prime_of_thick(
        annotated_transcripts, num_cpus=args.num_cpus)

    if len(three_prime_regions) == 0:
        msg = "No annotated 3' trailer regions were found"
        logger.warning(msg)

    msg = "Splitting the 3' trailers into exons"
    logger.info(msg)
    three_prime_exons = bed_utils.split_bed12(three_prime_regions,
                                              num_cpus=args.num_cpus,
                                              progress_bar=True)

    msg = "Splitting non-coding transcripts into exons"
    logger.info(msg)

    m_no_thick_start = annotated_transcripts['thick_start'] == -1
    m_no_thick_end = annotated_transcripts['thick_end'] == -1
    m_no_thick = m_no_thick_start & m_no_thick_end
    noncoding_transcripts = annotated_transcripts[m_no_thick]
    noncoding_exons = bed_utils.split_bed12(noncoding_transcripts,
                                            num_cpus=args.num_cpus,
                                            progress_bar=True)

    # First, remove all in-frame ORFs (canonical, canonical variants), and also 'within' and out-of-frame ORFs

    msg = "Marking canonical and extracted ORFs with the same stop codon"
    logger.info(msg)

    # first, add the "true" ORF end
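    # for ORFs on the reverse strand, the stop codon is at the genomic 'start'
    # coordinate, so 'orf_end' records the stop-codon side of each ORF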
    m_reverse_canonical = canonical_orfs['strand'] == '-'
    canonical_orfs['orf_end'] = canonical_orfs['end']
    canonical_orfs.loc[m_reverse_canonical,
                       'orf_end'] = canonical_orfs.loc[m_reverse_canonical,
                                                       'start']

    m_reverse_extracted = extracted_orfs['strand'] == '-'
    extracted_orfs['orf_end'] = extracted_orfs['end']
    extracted_orfs.loc[m_reverse_extracted,
                       'orf_end'] = extracted_orfs.loc[m_reverse_extracted,
                                                       'start']

    # then, find extracted ORFs with the same "orf_end" (and seqname, strand) as canonical ORFs
    merge_fields = ['seqname', 'strand', 'orf_end']
    canonical_extracted_orf_ends = canonical_orfs.merge(
        extracted_orfs, on=merge_fields, suffixes=['_canonical', '_extracted'])

    # finally, pull this into a set
    zip_it = zip(canonical_extracted_orf_ends['id_canonical'],
                 canonical_extracted_orf_ends['id_extracted'])
    canonical_extracted_matching_ends = {(c, a) for c, a in zip_it}

    msg = "Finding ORFs which exactly overlap the canonical ORFs"
    logger.info(msg)

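    # requiring complete overlap in both directions (min_a_overlap=1 and
    # min_b_overlap=1) presumably selects ORFs whose exon structure exactly
    # matches that of a canonical ORF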
    exact_matches = bed_utils.get_bed_overlaps(canonical_orf_exons,
                                               extracted_orf_exons,
                                               min_a_overlap=1,
                                               min_b_overlap=1)

    exact_match_orf_ids = {m.b_info for m in exact_matches}

    m_exact_orf_matches = extracted_orf_exons['id'].isin(exact_match_orf_ids)
    extracted_orf_exons = extracted_orf_exons[~m_exact_orf_matches]

    m_canonical = extracted_orfs['id'].isin(exact_match_orf_ids)
    label = 'canonical'
    extracted_orfs.loc[m_canonical, 'orf_type'] = label

    msg = "Found {} canonical ORFs".format(len(exact_match_orf_ids))
    logger.info(msg)

    msg = "Finding truncated canonical ORFs"
    logger.info(msg)

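    # a truncated variant is completely contained within a canonical ORF
    # (min_b_overlap=1) and shares its stop codon, as recorded in
    # canonical_extracted_matching_ends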
    truncated_matches = bed_utils.get_bed_overlaps(canonical_orf_exons,
                                                   extracted_orf_exons,
                                                   min_b_overlap=1)

    truncated_match_ids = {
        m.b_info
        for m in truncated_matches
        if (m.a_info, m.b_info) in canonical_extracted_matching_ends
    }

    m_truncated_matches = extracted_orf_exons['id'].isin(truncated_match_ids)
    extracted_orf_exons = extracted_orf_exons[~m_truncated_matches]

    m_canonical_truncated = extracted_orfs['id'].isin(truncated_match_ids)

    msg = "Finding extended canonical ORFs"
    logger.info(msg)

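    # an extended variant completely contains a canonical ORF (min_a_overlap=1)
    # and shares its stop codon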
    extended_matches = bed_utils.get_bed_overlaps(canonical_orf_exons,
                                                  extracted_orf_exons,
                                                  min_a_overlap=1)

    # For standard assembly, we also need to make sure that
    # all extended matches are fully contained within the
    # transcript structure (i.e start upstream but otherwise
    # have the same structure).
    if args.nonoverlapping_label is None:

        transcript_matches = bed_utils.get_bed_overlaps(annotated_exons,
                                                        extracted_orf_exons,
                                                        min_b_overlap=1)
        transcript_match_pairs = {(m.a_info, m.b_info)
                                  for m in transcript_matches}

        extended_match_ids = {
            m.b_info
            for m in extended_matches
            if (m.a_info, m.b_info) in transcript_match_pairs and (
                m.a_info, m.b_info) in canonical_extracted_matching_ends
        }

    else:

        extended_match_ids = {
            m.b_info
            for m in extended_matches
            if (m.a_info, m.b_info) in canonical_extracted_matching_ends
        }

    m_extended_matches = extracted_orf_exons['id'].isin(extended_match_ids)
    extracted_orf_exons = extracted_orf_exons[~m_extended_matches]

    m_canonical_extended = extracted_orfs['id'].isin(extended_match_ids)
    m_canonical_variants = m_canonical_truncated | m_canonical_extended

    label = "{}canonical_variant".format(args.label_prefix)
    extracted_orfs.loc[m_canonical_variants, 'orf_type'] = label

    msg = "Found {} canonical_variant ORFs".\
          format(len(extended_match_ids | truncated_match_ids))
    logger.info(msg)

    msg = ("Finding within canonical ORFs that do not share an "
           "annotated stop codon with a canonical ORF (e.g. in "
           "frame stop, out-of-frame)")
    logger.info(msg)

    within_ids = {
        m.b_info
        for m in truncated_matches if m.b_info not in truncated_match_ids
    }

    m_within_matches = extracted_orf_exons['id'].isin(within_ids)
    extracted_orf_exons = extracted_orf_exons[~m_within_matches]

    m_within = extracted_orfs['id'].isin(within_ids)
    label = "{}within".format(args.label_prefix)
    extracted_orfs.loc[m_within, 'orf_type'] = label

    msg = "Found {} within ORFs".format(len(within_ids))
    logger.info(msg)

    # find all overlapping ORFs
    msg = "Finding all UTR overlap matches"
    logger.info(msg)
    out_of_frame_matches = bed_utils.get_bed_overlaps(canonical_orf_exons,
                                                      extracted_orf_exons)

    leader_matches = bed_utils.get_bed_overlaps(five_prime_exons,
                                                extracted_orf_exons)

    trailer_matches = bed_utils.get_bed_overlaps(three_prime_exons,
                                                 extracted_orf_exons)

    msg = ("Labeling ORFs which have (out-of-frame) overlaps with both a "
           "canonical ORF and annotated leaders or trailers")
    logger.info(msg)

    # We need to choose how to ensure that up-/downstream overlaps are unique.
    # Where an ORF overlaps both the 5'UTR and the 3'UTR of different same
    # sense overlapping transcripts, it is assigned by default to the downstream overlap.
    # For de novo, everything is labeled as overlap.

    leader_match_pairs = {(m.a_info, m.b_info) for m in leader_matches}
    trailer_match_pairs = {(m.a_info, m.b_info) for m in trailer_matches}

    if args.nonoverlapping_label is None:

        # For standard assembly, we also need to make sure that
        # all overlap matches are fully contained within the
        # transcript structure.
        transcript_matches = bed_utils.get_bed_overlaps(annotated_exons,
                                                        extracted_orf_exons,
                                                        min_b_overlap=1)

        transcript_match_pairs = {(m.a_info, m.b_info)
                                  for m in transcript_matches}

        leader_overlap_pairs = {
            (m.a_info, m.b_info)
            for m in out_of_frame_matches
            if (m.a_info, m.b_info) in leader_match_pairs and (
                m.a_info, m.b_info) not in trailer_match_pairs and (
                    m.a_info, m.b_info) in transcript_match_pairs
        }

        trailer_overlap_pairs = {
            (m.a_info, m.b_info)
            for m in out_of_frame_matches
            if (m.a_info, m.b_info) in trailer_match_pairs and (
                m.a_info, m.b_info) not in leader_match_pairs and (
                    m.a_info, m.b_info) in transcript_match_pairs
        }

        # We do not assign preference where the ORF overlaps both sides
        # of the coding sequence on the same transcript, any ORF
        # satisfying both will be labeled simply as overlap.
        overlap_ids = {
            m.b_info
            for m in out_of_frame_matches
            if (m.a_info, m.b_info) in leader_match_pairs and (
                m.a_info, m.b_info) in trailer_match_pairs and (
                    m.a_info, m.b_info) in transcript_match_pairs
        }

        trailer_overlap_ids = {
            pair[1]
            for pair in trailer_overlap_pairs if pair[1] not in overlap_ids
        }

        leader_overlap_ids = {
            pair[1]
            for pair in leader_overlap_pairs if
            pair[1] not in trailer_overlap_ids and pair[1] not in overlap_ids
        }

        m_overlap_matches = extracted_orf_exons['id'].isin(overlap_ids)
        extracted_orf_exons = extracted_orf_exons[~m_overlap_matches]

        m_leader_overlap_matches = extracted_orf_exons['id'].isin(
            leader_overlap_ids)
        extracted_orf_exons = extracted_orf_exons[~m_leader_overlap_matches]

        m_five_prime_overlap = extracted_orfs['id'].isin(leader_overlap_ids)
        label = "{}five_prime_overlap".format(args.label_prefix)
        extracted_orfs.loc[m_five_prime_overlap, 'orf_type'] = label

        m_trailer_overlap_matches = extracted_orf_exons['id'].isin(
            trailer_overlap_ids)
        extracted_orf_exons = extracted_orf_exons[~m_trailer_overlap_matches]

        m_three_prime_overlap = extracted_orfs['id'].isin(trailer_overlap_ids)
        label = "{}three_prime_overlap".format(args.label_prefix)
        extracted_orfs.loc[m_three_prime_overlap, 'orf_type'] = label

        msg = "Found {} five_prime_overlap ORFs".format(
            len(leader_overlap_ids))
        logger.info(msg)
        msg = "Found {} three_prime_overlap ORFs".format(
            len(trailer_overlap_ids))
        logger.info(msg)

    else:

        overlap_ids = {m.b_info for m in out_of_frame_matches}
        overlap_ids |= {m.b_info for m in leader_matches}
        overlap_ids |= {m.b_info for m in trailer_matches}

        m_overlap_matches = extracted_orf_exons['id'].isin(overlap_ids)
        extracted_orf_exons = extracted_orf_exons[~m_overlap_matches]

    m_overlap = extracted_orfs['id'].isin(overlap_ids)
    label = "{}overlap".format(args.label_prefix)
    extracted_orfs.loc[m_overlap, 'orf_type'] = label

    msg = "Found {} overlap ORFs".format(len(overlap_ids))
    logger.info(msg)

    msg = "Finding ORFs completely within 5' or 3' leaders"
    logger.info(msg)

    leader_matches = bed_utils.get_bed_overlaps(five_prime_exons,
                                                extracted_orf_exons,
                                                min_b_overlap=1)

    leader_ids = {m.b_info for m in leader_matches}

    m_leader_matches = extracted_orf_exons['id'].isin(leader_ids)
    extracted_orf_exons = extracted_orf_exons[~m_leader_matches]

    m_five_prime = extracted_orfs['id'].isin(leader_ids)
    label = "{}five_prime".format(args.label_prefix)
    extracted_orfs.loc[m_five_prime, 'orf_type'] = label

    msg = "Found {} five_prime ORFs".format(len(leader_ids))
    logger.info(msg)

    trailer_matches = bed_utils.get_bed_overlaps(three_prime_exons,
                                                 extracted_orf_exons,
                                                 min_b_overlap=1)

    trailer_ids = {m.b_info for m in trailer_matches}

    m_trailer_matches = extracted_orf_exons['id'].isin(trailer_ids)
    extracted_orf_exons = extracted_orf_exons[~m_trailer_matches]

    m_three_prime = extracted_orfs['id'].isin(trailer_ids)
    label = "{}three_prime".format(args.label_prefix)
    extracted_orfs.loc[m_three_prime, 'orf_type'] = label

    msg = "Found {} three_prime ORFs".format(len(trailer_ids))
    logger.info(msg)

    msg = "Finding ORFs completely within annotated, non-coding transcripts"
    logger.info(msg)

    noncoding_matches = bed_utils.get_bed_overlaps(noncoding_exons,
                                                   extracted_orf_exons,
                                                   min_b_overlap=1)

    noncoding_ids = {m.b_info for m in noncoding_matches}

    m_noncoding_matches = extracted_orf_exons['id'].isin(noncoding_ids)
    extracted_orf_exons = extracted_orf_exons[~m_noncoding_matches]

    m_noncoding = extracted_orfs['id'].isin(noncoding_ids)
    label = "{}noncoding".format(args.label_prefix)
    extracted_orfs.loc[m_noncoding, 'orf_type'] = label

    msg = "Found {} noncoding ORFs".format(len(noncoding_ids))
    logger.info(msg)

    # all of the remaining ORFs fall into the "suspect" category
    suspect_ids = {orf_id for orf_id in extracted_orf_exons['id']}

    m_suspect = extracted_orfs['id'].isin(suspect_ids)
    label = "{}suspect".format(args.label_prefix)
    extracted_orfs.loc[m_suspect, 'orf_type'] = label

    n_suspect_ids = len(suspect_ids)
    msg = "Remaining {} ORFs labeled as suspect".format(n_suspect_ids)
    logger.info(msg)

    m_no_orf_type = extracted_orfs['orf_type'].isnull()
    msg = "Found {} unlabeled ORFs".format(sum(m_no_orf_type))
    logger.info(msg)

    msg = "Writing ORFs with labels to disk"
    logger.info(msg)

    extracted_orfs = bed_utils.sort(extracted_orfs)

    msg = ("The ORF labels will be written to {} in the next major release.".
           format(args.out))
    logger.warning(msg)

    additional_columns = ['orf_num', 'orf_len', 'orf_type']
    fields = bed_utils.bed12_field_names + additional_columns
    orfs_genomic = extracted_orfs[fields]
    bed_utils.write_bed(orfs_genomic, args.extracted_orfs)

    label_columns = ['id', 'duplicates', 'orf_type']
    extracted_orfs = extracted_orfs[label_columns]
    bed_utils.write_bed(extracted_orfs, args.out)
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description="""Given a list of ORFs with associated Bayes 
        factors and a fasta sequence file, this script extracts the sequences of the ORFs whose 
        Bayes factor exceeds the given threshold. Finally, biopython is used to translate the 
        selected ORFs into protein sequences. The min-length and minimum-profile-sum filters 
        are applied in the obvious way. For both BF and chi-square predictions, only ORFs 
        which have more reads in the first reading frame than either of the other two will 
        be selected as translated. (This is called the 'frame filter' below.) The selection 
        based on Bayes factors follows this logic: if max_bf_var is given, then it and 
        min_bf_mean are taken as a hard threshold on the estimated Bayes factor mean. 
        If min_bf_likelihood is given, then min_bf_mean is taken as the boundary value; 
        that is, an ORF is 'translated' if: [P(bf > min_bf_mean)] > min_bf_likelihood.
        If both max_bf_var and min_bf_likelihood are None, then min_bf_mean is taken as a
        hard threshold on the mean for selecting translated ORFs. If both max_bf_var and 
        min_bf_likelihood are given, then both filters will be applied and the result will 
        be the intersection. If the --use-chi-square option is given, the significance value is
        Bonferroni-corrected based on the number of ORFs which meet the length, profile
        and frame filters.""")

    parser.add_argument('bayes_factors', help="""The file containing the ORFs and Bayes'
        factors (BED12+).""")

    parser.add_argument('fasta', help="The *genome* fasta file")

    parser.add_argument('predicted_orfs', help="""The (output) BED12+ file containing
        the predicted ORFs.""")

    parser.add_argument('predicted_dna_sequences', help="""The (output) fasta file 
        containing the predicted ORF sequences, as DNA sequences.""")

    parser.add_argument('predicted_protein_sequences', help="""The (output) fasta file 
        containing the predicted ORF sequences, as protein sequences.""")

    parser.add_argument('--select-longest-by-stop', help="""If this flag is given, then
        the selected ORFs will be merged based on stop codons. In particular, only the
        longest translated ORF at each stop codon will be selected.""", action='store_true')

    parser.add_argument('--select-best-overlapping', help="""If this flag is given, then
        only the ORF with the highest estimated Bayes factor will be kept among each
        set of overlapping ORFs. N.B. This filter is applied *AFTER* selecting the
        longest ORF at each stop codon, if the --select-longest-by-stop flag is given.""",
                        action='store_true')

    parser.add_argument('--min-length', help="The minimum length to predict an ORF as translated",
                        type=int, default=translation_options['orf_min_length'])
    
    parser.add_argument('--min-bf-mean', help="""The minimum Bayes' factor mean to predict
        an ORF as translated (use --help for more details)""",
                        type=float, default=translation_options['min_bf_mean'])

    parser.add_argument('--max-bf-var', help="""The maximum Bayes' factor variance to predict
        an ORF as translated (use --help for more details).""",
                        type=float, default=translation_options['max_bf_var'])

    parser.add_argument('--min-bf-likelihood', help="""If given, then this is taken as a threshold 
        on the likelihood of translation (use --help for more details).""",
                        type=float, default=translation_options['min_bf_likelihood'])

    parser.add_argument('--min-profile', help="""The minimum sum across all reading frames to consider
        an ORF as translated""", type=float, default=translation_options['orf_min_profile_count'])
   
    parser.add_argument('--chi-square-only', help="""If this flag is present, then the
        chi square value will be used to predict ORFs rather than the Bayes' factor.""",
                        action='store_true')

    parser.add_argument('--chisq-significance-level', help="""If using chi square, then this
        value is Bonferroni corrected and used as the significance cutoff, else it is ignored.""",
                        type=float, default=translation_options['chisq_alpha'])

    parser.add_argument('--filtered-orf-types', help="""A list of ORF types which will be
        removed before selecting the final prediction set.""", nargs='*', default=[])

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # first, extract all of the predictions which exceed the threshold
    msg = "Reading Bayes factor information"
    logger.info(msg)
    
    bayes_factors = bed_utils.read_bed(args.bayes_factors)

    if len(args.filtered_orf_types) > 0:
        filtered_orf_types_str = ','.join(args.filtered_orf_types)
        msg = "Filtering these ORF types: {}".format(filtered_orf_types_str)
        logger.info(msg)

        m_orf_types = bayes_factors['orf_type'].isin(args.filtered_orf_types)
        bayes_factors = bayes_factors[~m_orf_types]

    msg = "Identifying ORFs which meet the prediction thresholds"
    logger.info(msg)

    all_orfs, predicted_orfs = ribo_utils.get_predicted_orfs(
        bayes_factors,
        min_signal=args.min_profile,
        min_length=args.min_length,
        min_bf_mean=args.min_bf_mean, 
        max_bf_var=args.max_bf_var, 
        min_bf_likelihood=args.min_bf_likelihood,
        chisq_alpha=args.chisq_significance_level,
        select_longest_by_stop=args.select_longest_by_stop,
        use_chi_square=args.chi_square_only
    )
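    # A rough sketch of the selection logic described in the help text above
    # (illustrative pseudocode only; the actual filtering happens inside get_predicted_orfs):
    #   keep = (profile sum >= min_profile) and (length >= min_length) and frame filter
    #   if max_bf_var is given:        keep &= (bf_mean > min_bf_mean) and (bf_var < max_bf_var)
    #   if min_bf_likelihood is given: keep &= P(bf > min_bf_mean) > min_bf_likelihood
    #   if both are None:              keep &= (bf_mean > min_bf_mean)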

    msg = "Number of selected ORFs: {}".format(len(predicted_orfs))
    logger.info(msg)

    if args.select_best_overlapping:

        msg = "Finding overlapping ORFs"
        logger.info(msg)

        merged_intervals = bed_utils.merge_all_intervals(predicted_orfs)

        msg = "Selecting best among overlapping ORFs"
        logger.info(msg)

        predicted_orfs = parallel.apply_iter_simple(
            merged_intervals['merged_ids'], 
            get_best_overlapping_orf, 
            predicted_orfs, 
            progress_bar=True
        )

        predicted_orfs = pd.DataFrame(predicted_orfs)

    msg = "Sorting selected ORFs"
    logger.info(msg)

    predicted_orfs = bed_utils.sort(predicted_orfs)

    msg = "Writing selected ORFs to disk"
    logger.info(msg)
    bed_utils.write_bed(predicted_orfs, args.predicted_orfs)

    # now get the sequences
    msg = "Extracting predicted ORFs DNA sequence"
    logger.info(msg)

    split_exons = True
    transcript_sequences = bed_utils.get_all_bed_sequences(
        predicted_orfs, 
        args.fasta, 
        split_exons
    )

    fastx_utils.write_fasta(transcript_sequences,
                            args.predicted_dna_sequences,
                            compress=False)

    # translate the remaining ORFs into protein sequences
    msg = "Converting predicted ORF sequences to amino acids"
    logger.info(msg)

    records = fastx_utils.get_read_iterator(args.predicted_dna_sequences)
    protein_records = {
        r[0]: Bio.Seq.translate(r[1]) for r in records
    }
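    # N.B. Bio.Seq.translate uses the standard codon table by default, so any
    # stop codon present in the DNA sequence appears as '*' in the protein sequence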
    
    fastx_utils.write_fasta(
        protein_records.items(), 
        args.predicted_protein_sequences, 
        compress=False
    )
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script identifies the orf peptide matches for all samples in "
        "a project.")
    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--peptide-filter-field',
                        help="The field to use for "
                        "filtering the peptides from MaxQuant",
                        default=default_peptide_filter_field)

    parser.add_argument('--peptide-filter-value',
                        help="All peptides with a value "
                        "greater than the filter value will be removed",
                        type=float,
                        default=default_peptide_filter_value)

    parser.add_argument('--peptide-separator',
                        help="The separator in the "
                        "peptide file",
                        default=default_peptide_separator)

    parser.add_argument(
        '--note',
        help="If this option is given, it will be used in "
        "the output filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=default_note)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)
    call = not args.do_not_call

    programs = ['get-orf-peptide-matches']
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'peptide_files', 'peptide_cell_type_analysis', 'riboseq_data',
        'riboseq_samples'
    ]
    utils.check_keys_exist(config, required_keys)

    note_str = config.get('note', None)
    out_note_str = note_str

    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note

    args_dict = vars(args)

    # the keys must match the argparse dests so the values are found and
    # forwarded as the --peptide-filter-* options of get-orf-peptide-matches
    peptide_filter_field_str = utils.get_config_argument(
        args_dict, 'peptide_filter_field')
    peptide_filter_value_str = utils.get_config_argument(
        args_dict, 'peptide_filter_value')
    peptide_separator_str = utils.get_config_argument(args_dict,
                                                      'peptide_separator')

    num_cpus_str = utils.get_config_argument(args_dict, 'num_cpus')

    cell_types = ribo_utils.get_riboseq_cell_type_samples(config)
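    # for every cell type listed under 'peptide_cell_type_analysis', submit one
    # 'get-orf-peptide-matches' job per associated peptide file, skipping any
    # cell type or peptide file that is missing from the config or from disk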
    for cell_type, peptide_files in config['peptide_cell_type_analysis'].items(
    ):
        if cell_type not in cell_types:
            msg = (
                "Could not find cell_type specification. Please check the config "
                "file: {}".format(cell_type))
            logger.warning(msg)
            continue

        cell_type_protein = ribo_filenames.get_riboseq_cell_type_protein(
            config['riboseq_data'], cell_type, is_filtered=True, note=note_str)

        if not os.path.exists(cell_type_protein):
            msg = ("Could not find cell_type protein fasta. Skipping: {}".
                   format(cell_type_protein))
            logger.warning(msg)
            continue

        for peptide_file in peptide_files:
            if peptide_file not in config['peptide_files']:
                msg = (
                    "Could not find peptide_file specification. Please check "
                    "the config file: {}".format(peptide_file))
                logger.warning(msg)
                continue

            peptide_txt_file = config['peptide_files'][peptide_file]

            if not os.path.exists(peptide_txt_file):
                msg = ("Could not find peptide.txt file. Skipping: {}".format(
                    peptide_txt_file))
                logger.warning(msg)
                continue

            peptide_matches = ribo_filenames.get_riboseq_peptide_matches(
                config['riboseq_data'],
                cell_type,
                peptide_file,
                is_filtered=True,
                note=out_note_str)

            cmd = "get-orf-peptide-matches {} {} {} {} {} {} {} {}".format(
                cell_type_protein, peptide_txt_file, peptide_matches,
                num_cpus_str, peptide_filter_field_str,
                peptide_filter_value_str, peptide_separator_str, logging_str)

            slurm.check_sbatch(cmd, args=args)
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Extract the ORF profiles for each specified read length "
        "and offset independently, creating one sparse matrix file (mtx) for "
        "each read length. These are then collected into a 'sparse tensor'.")

    parser.add_argument('config', help="The yaml config file.")
    parser.add_argument('name', help="The name of either one of the 'riboseq_samples'"
        "or 'riboseq_biological_replicates' from the config file.")
    
    parser.add_argument('out', help="The output (txt.gz) file. N.B. The output uses"
        "base-0 indexing, contrary to the unsmoothed ORF profiles, which are written"
        "using the matrix market format (base-1 indexing).")

    parser.add_argument('-c', '--is-condition', help="If this flag is present, "
        "then 'name' will be taken to be a condition name. The profiles for "
        "all relevant replicates of the condition will be created.", 
        action='store_true')

    parser.add_argument('--add-ids', help="If this flag is present, "
        "then orf_ids will be added to the final output.", action='store_true')

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    cpus_str = "--num-cpus {}".format(args.num_cpus)

    msg = "[create-read-length-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    msg = "Reading config file"
    logger.info(msg)
    config = yaml.load(open(args.config), Loader=yaml.FullLoader)
 
    # pull out what we need from the config file
    is_unique = not ('keep_riboseq_multimappers' in config)    
    seqname_str = utils.get_config_argument(config, 'seqname_prefix')
    note = config.get('note', None)
    orf_note = config.get('orf_note', None)
    
    orfs = filenames.get_orfs(
        config['genome_base_path'], 
        config['genome_name'], 
        note=orf_note
    )

    exons = filenames.get_exons(
        config['genome_base_path'], 
        config['genome_name'],
        note=orf_note,
        is_orf=True
    )
    
    # make sure the necessary files exist
    required_files = [orfs, exons]
    msg = "[create-read-length-orf-profiles]: Some input files were missing: "
    utils.check_files_exist(required_files, msg=msg)

    # process one sample or all samples from condition
    names = [args.name]
    is_condition_str = ""
    if args.is_condition:
        is_condition_str = "--is-condition"
        riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
        names = [n for n in riboseq_replicates[args.name]]

    job_ids = []
    for name in names:

        msg = "Processing sample: {}".format(name)
        logger.info(msg)
        
        # now the relevant files
        bam = filenames.get_riboseq_bam(
            config['riboseq_data'], 
            name, 
            is_unique=is_unique, 
            note=note
        )

        # make sure the necessary files exist
        required_files = [bam]
        msg = "[create-read-length-orf-profiles]: Some input files were missing: "
        utils.check_files_exist(required_files, msg=msg)

        # get the lengths and offsets which meet the required criteria from the config file
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config, 
            name, 
            is_unique=is_unique
        )

        if len(lengths) == 0:
            msg = ("No periodic read lengths and offsets were found. Try relaxing "
                "min_metagene_profile_count, min_metagene_bf_mean, "
                "max_metagene_bf_var, and/or min_metagene_bf_likelihood. Qutting.")
            logger.critical(msg)
            return

        for length, offset in zip(lengths, offsets):
            lengths_str = "--lengths {}".format(length)
            offsets_str = "--offsets {}".format(offset)

            mtx = filenames.get_riboseq_profiles(
                config['riboseq_data'], 
                name, 
                length=[length], 
                offset=[offset],
                is_unique=is_unique, 
                note=note
            )

            cmd = "extract-orf-profiles {} {} {} {} {} {} {} {}".format(
                bam,
                orfs,
                exons,
                mtx,
                lengths_str,
                offsets_str,
                seqname_str,
                cpus_str,
                logging_str
            )
            
            job_id = slurm.check_sbatch(cmd, args=args)

            job_ids.append(job_id)

    add_ids_str = ""
    if args.add_ids:
        add_ids_str = "--add-ids"

    cmd = "collect-read-length-orf-profiles {} {} {} {} {}".format(
        args.config,
        args.name,
        args.out,
        is_condition_str,
        add_ids_str,
        logging_str
    )

    slurm.check_sbatch(cmd, args=args, dependencies=job_ids)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='''Extract the ORFs from the given transcripts and
        write as a BED12+ file. Additional fields, 'orf_len' and 'orf_num', give the length 
        of each ORF and its index (used to write the ORF profiles). A third additional field
        records duplicated ORFs from transcript variants.''')

    parser.add_argument('transcripts_bed',
                        help='''The BED12 file containing the 
        transcript information.''')

    parser.add_argument('transcripts_fasta',
                        help='''The fasta file containing the 
        spliced transcript sequences.''')

    parser.add_argument('out', help='''The output (BED12+ gz) file.''')

    parser.add_argument('--start-codons',
                        help='''A list of codons which will be treated 
        as start codons when extracting the ORFs.''',
                        nargs='+',
                        default=default_start_codons)

    parser.add_argument('--stop-codons',
                        help='''A list of codons which will be treated 
        as stop codons when extracting the ORFs.''',
                        nargs='+',
                        default=default_stop_codons)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # check if we wanted to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    msg = "Compiling start and stop codon regular expressions"
    logger.info(msg)

    start_codons_re = '|'.join(args.start_codons)
    stop_codons_re = '|'.join(args.stop_codons)

    start_codons_re = re.compile(start_codons_re)
    stop_codons_re = re.compile(stop_codons_re)

    msg = "Reading transcripts bed file"
    logger.info(msg)
    transcripts_bed = bed_utils.read_bed(args.transcripts_bed)

    msg = "Creating the sequence iterator"
    logger.info(msg)

    transcripts_fasta = fastx_utils.get_read_iterator(args.transcripts_fasta)

    transcripts_iter = ((get_transcript(transcript_header,
                                        transcripts_bed), transcript_sequence)
                        for (transcript_header,
                             transcript_sequence) in transcripts_fasta)
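    # each item pairs a transcript's BED record (looked up from its fasta header)
    # with its spliced sequence, so get_orfs receives both coordinates and sequence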

    msg = "Finding all ORFs"
    logger.info(msg)

    orfs = parallel.apply_parallel_iter(transcripts_iter,
                                        args.num_cpus,
                                        get_orfs,
                                        start_codons_re,
                                        stop_codons_re,
                                        total=len(transcripts_bed),
                                        progress_bar=True)

    msg = "Joining ORFs in a large data frame"
    logger.info(msg)

    orfs = pd.concat(orfs)
    orfs.reset_index(drop=True, inplace=True)

    #  This is done arbitrarily, however we keep all matching
    #  transcripts for reference
    msg = "Marking and removing duplicate ORFs"
    logger.info(msg)

    groupby_duplicates = orfs.groupby(DUPLICATE_FIELDS,
                                      as_index=False).agg({'id': ','.join})
    orfs = orfs.merge(groupby_duplicates, how='left', on=DUPLICATE_FIELDS)
    orfs.drop_duplicates(subset=DUPLICATE_FIELDS, inplace=True, keep='first')
    orfs.rename(columns={'id_x': 'id', 'id_y': 'duplicates'}, inplace=True)
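    # the merge produces 'id_x' (the representative ORF id) and 'id_y' (a
    # comma-separated list of all ORFs sharing the same DUPLICATE_FIELDS); only
    # the first ORF of each duplicate group is kept, and the columns are renamed
    # to 'id' and 'duplicates'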

    msg = "Numbering remaining ORFs"
    logger.info(msg)

    orfs['orf_num'] = np.arange(len(orfs))

    msg = "Writing ORFs to disk"
    logger.info(msg)
    bed_utils.write_bed(orfs, args.out)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script visualizes the clusters found with "
        "cluster-subcodon-counts.")

    parser.add_argument('pkl',
                        help="The pickled model file created by "
                        "cluster-subcodon-counts")
    parser.add_argument('out', help="The output image")

    parser.add_argument('--title',
                        help="The title for the plot",
                        default=default_title)
    parser.add_argument('--min-weight',
                        help="The minimum weight required to "
                        "show the associated cluster",
                        type=float,
                        default=default_min_weight)
    parser.add_argument('--log',
                        help="If this flag is given, then the plot "
                        "will use a log scale",
                        action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading model pickle file"
    logger.info(msg)
    model_pkl = pickle.load(open(args.pkl, 'rb'))

    msg = "Extracting clusters with minimum weight"
    logger.info(msg)

    it = enumerate(zip(model_pkl[0], model_pkl[1]))

    periodic_clusters = []

    total_weight = 0
    for i, (m, w) in it:
        if w > args.min_weight:
            total_weight += w
            periodic_clusters.append(i)

    msg = "Finding linear best fit line"
    logger.info(msg)

    c = model_pkl[0][periodic_clusters, 0]
    x = model_pkl[0][periodic_clusters, 1]
    y = model_pkl[0][periodic_clusters, 2]
    s = model_pkl[1][periodic_clusters]
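    # judging by the plot labels below, the mixture component means appear to hold
    # (in-frame, frame +1, frame +2) counts, and model_pkl[1] the component weights,
    # which are used both as fit weights and to size the scatter points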

    fit = math_utils.fit_with_least_squares(x, y, w=s)
    (slope, intercept, power, r_sqr) = fit

    msg = "Plotting clusters"
    logger.info(msg)

    min_val = min(min(x), min(y)) * 0.8
    max_val = max(max(x), max(y)) * 1.2
    lim = (min_val, max_val)

    fig, ax = plt.subplots()

    # axes and labels and things
    ax.set_aspect('equal')
    ax.set_xlabel("Frame +1")
    ax.set_ylabel("Frame +2")

    ax.set_xlim(lim)
    ax.set_ylim(lim)

    if args.log:
        ax.set_xscale('log')
        ax.set_yscale('log')

    cm = plt.cm.Blues

    norm = None
    if args.log:
        norm = matplotlib.colors.LogNorm()

    sc = ax.scatter(x, y, c=c, cmap=cm, s=s * 1000, norm=norm)
    cb = plt.colorbar(sc, ax=ax)
    cb.set_label("In-frame")

    text = "Accounts for {:.0%} of likelihood".format(total_weight)
    ax.annotate(text, (0.25, 0.75), xycoords='axes fraction')

    # draw the fit line
    mpl_utils.plot_trend_line(ax, x, intercept, slope, power)

    # write the fit information
    rsqr_str = "$R^2$ = {:.2f}".format(r_sqr)
    slope_str = "slope = {:.2f}".format(slope)
    intercept_str = "intercept = {:.2f}".format(intercept)
    strs = [rsqr_str, slope_str, intercept_str]
    text = '\n'.join(strs)

    ax.annotate(text, (0.55, 0.15), xycoords='axes fraction')

    if len(args.title) > 0:
        ax.set_title(args.title)

    msg = "Writing the plot to disk"
    logger.info(msg)

    fig.savefig(args.out)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script uses the peptides.txt file from MaxQuant to determine "
        "which predicted ORFs have some proteomics evidence.\n\nIt contains "
        "some hard-coded field names.")
    parser.add_argument('predicted_proteins',
                        help="The (fasta, protein) file of "
                        "predicted ORFs")
    parser.add_argument('peptides',
                        help="The peptides.txt file produced by MaxQuant")
    parser.add_argument(
        'out',
        help="The output (csv.gz) file containing the predicted "
        "ORFs and their coverage")

    parser.add_argument('--num-cpus',
                        help="The number of CPUs to use for searching",
                        type=int,
                        default=default_num_cpus)

    parser.add_argument('--peptide-filter-field',
                        help="The field to use for filtering "
                        "the peptides from MaxQuant",
                        default=default_peptide_filter_field)
    parser.add_argument('--peptide-filter-value',
                        help="All peptides with a value greater "
                        "than the filter value will be removed",
                        type=float,
                        default=default_peptide_filter_value)

    parser.add_argument('--peptide-separator',
                        help="The separator in the --peptide file",
                        default=default_peptide_separator)

    parser.add_argument(
        '-g',
        '--num-groups',
        help="The number of groups into which to split "
        "the ORFs. More groups means the progress bar is updated more frequently but incurs "
        "more overhead because of the parallel calls.",
        type=int,
        default=default_num_groups)

    parser.add_argument(
        '--num-peptides',
        help="If n>0, then only the first n peptide "
        "sequences will be used to calculate coverage. This is for testing.",
        type=int,
        default=default_num_peptides)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[get-orf-peptide-matches]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    msg = "Reading and filtering peptides"
    logger.info(msg)

    peptides = pd.read_csv(args.peptides, sep=args.peptide_separator)
    mask_filter = peptides[
        args.peptide_filter_field] < args.peptide_filter_value
    peptides = peptides[mask_filter]
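    # only peptides whose filter-field value is strictly below the threshold are
    # kept (e.g., a small MaxQuant PEP score, assuming that is the configured field)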
    peptide_sequences = pd.DataFrame(peptides['Sequence'])

    if args.num_peptides > 0:
        peptide_sequences = peptide_sequences.head(args.num_peptides)

    msg = "Number of filtered peptides: {}".format(len(peptide_sequences))
    logger.info(msg)

    msg = "Reading predicted ORFs into a data frame"
    logger.info(msg)

    # TODO: use read iterator
    predicted_orfs = fastx_utils.get_read_iterator(args.predicted_proteins)
    orf_ids = []
    orf_sequences = []

    for orf_id, seq in predicted_orfs:
        orf_ids.append(orf_id)
        orf_sequences.append(seq)

    predicted_orfs_df = pd.DataFrame()
    predicted_orfs_df['orf_id'] = orf_ids
    predicted_orfs_df['orf_sequence'] = orf_sequences

    msg = "Searching for matching peptides"
    logger.info(msg)

    peptide_matches = parallel.apply_parallel_split(peptide_sequences,
                                                    args.num_cpus,
                                                    find_matching_orfs_group,
                                                    predicted_orfs_df,
                                                    progress_bar=True,
                                                    num_groups=args.num_groups)
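    # apply_parallel_split chunks the peptide table into num_groups pieces (see
    # --num-groups) and searches each chunk against all predicted ORF sequences
    # across the available CPUs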

    # filter out the Nones to avoid DataFrame conversion problems
    msg = "Joining results back into large data frame"
    logger.info(msg)

    peptide_matches = [pm for pm in peptide_matches if pm is not None]
    peptide_matches = pd.concat(peptide_matches)

    # now, we have a data frame of matches (fields: peptide, orf_id)
    msg = "Getting peptide coverage of ORFs"
    logger.info(msg)

    # first, count the matches for each ORF
    peptide_matches_groups = peptide_matches.groupby('orf_id')

    orf_matches = parallel.apply_parallel_groups(peptide_matches_groups,
                                                 args.num_cpus,
                                                 count_matches,
                                                 progress_bar=True)
    orf_matches = pd.DataFrame(orf_matches)

    # then join back on the original list of ORFs to have entries for ORFs
    # with no peptide matches
    predicted_orf_coverage = pd.merge(predicted_orfs_df,
                                      orf_matches,
                                      on='orf_id',
                                      how="left")

    # and patch the holes in the data frame
    predicted_orf_coverage = predicted_orf_coverage.fillna(0)

    msg = "Writing coverage information to disk"
    pandas_utils.write_df(predicted_orf_coverage, args.out, index=False)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""This is a helper script to submit a set of
        samples to SLURM. It can also be used to run a set of samples sequentially. Due to limitations 
        on the config file specification, all of the samples must use the same reference indices 
        obtained by running 'create-base-genome-profile'.""")

    parser.add_argument('config', help="The (yaml) configuration file")

    parser.add_argument('--tmp', help="The temp directory", default=None)

    parser.add_argument('--overwrite',
                        help="""If this flag is present, existing files 
        will be overwritten.""",
                        action='store_true')

    parser.add_argument('--profiles-only',
                        help="""If this flag is present, then only
        the pre-processing part of the pipeline will be called, i.e. profiles
        will be created for each sample specified in the config file, but no predictions
        will be made.""",
                        action='store_true')

    parser.add_argument('--merge-replicates',
                        help="""If this flag is present, then
        the ORF profiles from the replicates will be merged before making the final
        predictions""",
                        action='store_true')

    parser.add_argument('--run-replicates',
                        help="""If this flag is given with the
        --merge-replicates flag, then both the replicates and the individual
        samples will be run. This flag has no effect if --merge-replicates is not
        given.""",
                        action='store_true')

    parser.add_argument('-k',
                        '--keep-intermediate-files',
                        help="""If this flag is given,
        then all intermediate files will be kept; otherwise, they will be
        deleted. This feature is implemented piecemeal. If the --do-not-call flag
        is given, then nothing will be deleted.""",
                        action='store_true')

    slurm.add_sbatch_options(parser,
                             num_cpus=default_num_cpus,
                             mem=default_mem)
    logging_utils.add_logging_options(parser)
    pgrm_utils.add_star_options(parser, star_executable)
    pgrm_utils.add_flexbar_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # check that all of the necessary programs are callable
    programs = [
        'flexbar', args.star_executable, 'samtools', 'bowtie2',
        'create-base-genome-profile', 'remove-multimapping-reads',
        'extract-metagene-profiles', 'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets', 'extract-orf-profiles',
        'estimate-orf-bayes-factors', 'select-final-prediction-set',
        'create-orf-profiles', 'predict-translated-orfs', 'run-rpbp-pipeline'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data', 'riboseq_samples', 'ribosomal_index', 'star_index',
        'genome_base_path', 'genome_name', 'fasta', 'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    # handle all option strings to call the pipeline script
    logging_str = logging_utils.get_logging_options_string(args)
    star_str = pgrm_utils.get_star_options_string(args)
    flexbar_str = pgrm_utils.get_flexbar_options_string(args)

    # handle do_not_call so that we do call the pipeline script, but that it does not run anything
    call = not args.do_not_call
    do_not_call_str = ""
    if not call:
        do_not_call_str = "--do-not-call"
    args.do_not_call = False
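    # resetting args.do_not_call ensures slurm.check_sbatch still submits (or runs)
    # run-rpbp-pipeline itself; the original flag is forwarded via do_not_call_str,
    # so the pipeline script walks through its steps without executing them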

    overwrite_str = ""
    if args.overwrite:
        overwrite_str = "--overwrite"

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    # check if we only want to create the profiles, in this case
    # we call run-rpbp-pipeline with the --profiles-only option
    profiles_only_str = ""
    if args.profiles_only:
        if args.merge_replicates:
            msg = (
                "The --profiles-only option was given; it takes precedence "
                "and will override the --merge-replicates option!"
            )
            logger.warning(msg)
        args.merge_replicates = False
        profiles_only_str = "--profiles-only"

    # if we merge the replicates, then we only use the rpbp script to create
    # the ORF profiles, but we still make predictions
    if args.merge_replicates and not args.run_replicates:
        profiles_only_str = "--profiles-only"

    if args.run_replicates and not args.merge_replicates:
        msg = (
            "The --run-replicates option was given without the --merge-replicates "
            "option. It will be ignored.")
        logger.warning(msg)

    # collect the job_ids in case we are using slurm and need to merge replicates
    rep_to_condition = ribo_utils.get_riboseq_replicates_reverse_map(config)
    job_ids_mapping = defaultdict(list)

    sample_names = sorted(config['riboseq_samples'].keys())

    for sample_name in sample_names:
        data = config['riboseq_samples'][sample_name]

        tmp_str = ""
        if args.tmp is not None:
            tmp = os.path.join(args.tmp, "{}_rpbp".format(sample_name))
            tmp_str = "--tmp {}".format(tmp)

        cmd = "run-rpbp-pipeline {} {} {} --num-cpus {} {} {} {} {} {} {} {} {} {}".format(
            data, args.config, sample_name, args.num_cpus, mem_str, tmp_str,
            do_not_call_str, overwrite_str, profiles_only_str,
            keep_intermediate_str, logging_str, star_str, flexbar_str)

        job_id = slurm.check_sbatch(cmd, args=args)
        job_ids_mapping[rep_to_condition[sample_name]].append(job_id)

    # now, if we are running the "standard" pipeline, we are done
    if not args.merge_replicates:
        return

    # otherwise, we need to merge the replicates for each condition
    riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
    merge_replicates_str = "--merge-replicates"

    for condition_name in sorted(riboseq_replicates.keys()):

        # then we predict the ORFs
        cmd = "predict-translated-orfs {} {} --num-cpus {} {} {} {} {}".format(
            args.config, condition_name, args.num_cpus, do_not_call_str,
            overwrite_str, logging_str, merge_replicates_str)

        job_ids = job_ids_mapping[condition_name]
        slurm.check_sbatch(cmd, args=args, dependencies=job_ids)
def main():
    
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description="""This script runs all of the processing necessary to 
        produce the signals used for ORF translation prediction. In particular, it creates the 
        metagene profiles, selects the periodic fragments, and generates the ORF profiles.""")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")

    parser.add_argument('config', help="The (yaml) configuration file")

    parser.add_argument('name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
                        type=int, default=default_num_cpus)

    parser.add_argument('--mem', help="The amount of RAM to request", default=default_mem)

    parser.add_argument('--tmp', help="The location for temp files", default=None)

    parser.add_argument('--do-not-call', action='store_true')

    parser.add_argument('--overwrite', help="""If this flag is present, existing files 
        will be overwritten.""", action='store_true')
         
    parser.add_argument('-k', '--keep-intermediate-files', help="""If this flag is given,
        then all intermediate files will be kept; otherwise, they will be deleted. 
        This feature is implemented piecemeal. If the --do-not-call flag is given, 
        then nothing will be deleted.""", action='store_true')

    logging_utils.add_logging_options(parser)
    pgrm_utils.add_star_options(parser, star_executable)
    pgrm_utils.add_flexbar_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[create-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'create-base-genome-profile',
        'remove-multimapping-reads',
        'extract-metagene-profiles',
        'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets',
        'extract-orf-profiles'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'ribosomal_index',
        'gtf',
        'genome_base_path',
        'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)
    models_base = config.get('models_base', default_models_base)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = pgrm_utils.get_star_options_string(args)
    flexbar_str = pgrm_utils.get_flexbar_options_string(args)

    # handle do_not_call so that we do call the preprocessing script,
    # but that it does not run anything
    call = not args.do_not_call
    do_not_call_argument = ""
    if not call:
        do_not_call_argument = "--do-not-call"

    overwrite_argument = ""
    if args.overwrite:
        overwrite_argument = "--overwrite"

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    # check if we want to keep multimappers
    is_unique = not ('keep_riboseq_multimappers' in config)

    riboseq_raw_data = args.raw_data
    riboseq_bam_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                     args.name,
                                                     is_unique=is_unique,
                                                     note=note)

    cmd = ("create-base-genome-profile {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}".format(
        riboseq_raw_data,
        args.config,
        args.name,
        args.num_cpus,
        do_not_call_argument,
        overwrite_argument,
        logging_str,
        star_str,
        tmp_str,
        flexbar_str,
        keep_intermediate_str,
        mem_str))

    # There could be cases where we start somewhere in the middle of creating
    # the base genome profile. So even if the "raw data" is not available, 
    # we still want to call the base pipeline.
    # in_files = [riboseq_raw_data]
    in_files = []
    out_files = [riboseq_bam_filename]
    # we always call this, and pass --do-not-call through
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=True)

    # Extract the metagene profiles

    start_upstream_str = utils.get_config_argument(config,
                                                   'metagene_start_upstream',
                                                   'start-upstream',
                                                   default=metagene_options['metagene_start_upstream'])
    start_downstream_str = utils.get_config_argument(config,
                                                     'metagene_start_downstream',
                                                     'start-downstream',
                                                     default=metagene_options['metagene_start_downstream'])
    end_upstream_str = utils.get_config_argument(config,
                                                 'metagene_end_upstream',
                                                 'end-upstream',
                                                 default=metagene_options['metagene_end_upstream'])
    end_downstream_str = utils.get_config_argument(config,
                                                   'metagene_end_downstream',
                                                   'end-downstream',
                                                   default=metagene_options['metagene_end_downstream'])

    metagene_profiles = filenames.get_metagene_profiles(config['riboseq_data'],
                                                        args.name,
                                                        is_unique=is_unique,
                                                        note=note)

    # use the canonical transcripts for extracting the metagene profiles
    transcript_bed = filenames.get_bed(config['genome_base_path'],
                                       config['genome_name'],
                                       is_merged=False,
                                       is_annotated=True)

    cmd = ("extract-metagene-profiles {} {} {} --num-cpus {} {} {} {} {} {}".format(
        riboseq_bam_filename,
        transcript_bed,
        metagene_profiles,
        args.num_cpus,
        logging_str,
        start_upstream_str,
        start_downstream_str,
        end_upstream_str,
        end_downstream_str))

    in_files = [riboseq_bam_filename, transcript_bed]
    out_files = [metagene_profiles]
    file_checkers = {
        metagene_profiles: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # estimate the periodicity for each offset for all read lengths
    metagene_profile_bayes_factors = filenames.get_metagene_profiles_bayes_factors(
        config['riboseq_data'],
        args.name,
        is_unique=is_unique,
        note=note)

    periodic_models = filenames.get_models(models_base, 'periodic')
    non_periodic_models = filenames.get_models(models_base, 'nonperiodic')
    
    periodic_models_str = ' '.join(periodic_models)
    non_periodic_models_str = ' '.join(non_periodic_models)

    periodic_models_str = "--periodic-models {}".format(periodic_models_str)
    non_periodic_models_str = "--nonperiodic-models {}".format(non_periodic_models_str)

    periodic_offset_start_str = utils.get_config_argument(config,
                                                          'periodic_offset_start',
                                                          default=metagene_options['periodic_offset_start'])
    periodic_offset_end_str = utils.get_config_argument(config,
                                                        'periodic_offset_end',
                                                        default=metagene_options['periodic_offset_end'])
    metagene_profile_length_str = utils.get_config_argument(config,
                                                            'metagene_profile_length',
                                                            default=metagene_options['metagene_profile_length'])
    seed_str = utils.get_config_argument(config,
                                         'seed',
                                         default=metagene_options['seed'])
    chains_str = utils.get_config_argument(config,
                                           'chains',
                                           default=metagene_options['chains'])
    iterations_str = utils.get_config_argument(config,
                                               'metagene_iterations',
                                               'iterations',
                                               default=metagene_options['metagene_iterations'])

    cmd = ("estimate-metagene-profile-bayes-factors {} {} --num-cpus {} {} {} "
           "{} {} {} {} {} {} {}".format(metagene_profiles,
                                         metagene_profile_bayes_factors,
                                         args.num_cpus,
                                         periodic_models_str,
                                         non_periodic_models_str,
                                         periodic_offset_start_str,
                                         periodic_offset_end_str,
                                         metagene_profile_length_str,
                                         seed_str,
                                         chains_str,
                                         iterations_str,
                                         logging_str))

    in_files = [metagene_profiles]
    in_files.extend(periodic_models)
    in_files.extend(non_periodic_models)
    out_files = [metagene_profile_bayes_factors]
    file_checkers = {
        metagene_profile_bayes_factors: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)
    
    # select the best read lengths for constructing the signal
    periodic_offsets = filenames.get_periodic_offsets(config['riboseq_data'],
                                                      args.name,
                                                      is_unique=is_unique,
                                                      note=note)

    cmd = "select-periodic-offsets {} {}".format(metagene_profile_bayes_factors,
                                                 periodic_offsets)

    in_files = [metagene_profile_bayes_factors]
    out_files = [periodic_offsets]
    file_checkers = {
        periodic_offsets: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # get the lengths and offsets which meet the required criteria from the config file
    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(config,
                                                                   args.name,
                                                                   args.do_not_call,
                                                                   is_unique=is_unique,
                                                                   default_params=metagene_options)

    if len(lengths) == 0:
        msg = ("No periodic read lengths and offsets were found. Try relaxing "
               "min_metagene_profile_count, min_metagene_bf_mean, max_metagene_bf_var, "
               "and/or min_metagene_bf_likelihood. Quitting.")
        logger.critical(msg)
        return

    lengths_str = ' '.join(lengths)
    offsets_str = ' '.join(offsets)

    seqname_prefix_str = utils.get_config_argument(config, 'seqname_prefix')
    
    # extract the riboseq profiles for each orf
    unique_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                args.name,
                                                is_unique=is_unique,
                                                note=note)

    profiles_filename = filenames.get_riboseq_profiles(config['riboseq_data'],
                                                       args.name,
                                                       length=lengths,
                                                       offset=offsets,
                                                       is_unique=is_unique,
                                                       note=note)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    cmd = ("extract-orf-profiles {} {} {} {} --lengths {} --offsets {} {} {} --num-cpus {} ".format(
        unique_filename,
        orfs_genomic,
        exons_file,
        profiles_filename,
        lengths_str,
        offsets_str,
        logging_str,
        seqname_prefix_str,
        args.num_cpus))

    in_files = [orfs_genomic, exons_file, unique_filename]
    out_files = [profiles_filename]

    # todo: implement a file checker for mtx files
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Visualize the read counts at each filtering step.")

    parser.add_argument('alignment_counts',
                        help="The (csv) alignment counts "
                        "(created with get-all-filtering-counts)")
    parser.add_argument('out', help="The output image file")

    parser.add_argument(
        '--alignment-counts-order',
        help="The fields to use "
        "from the alignment_counts file. The order should range from the "
        "least strict filter to most strict filter.",
        nargs='+',
        default=default_alignment_counts_order)

    parser.add_argument(
        '--alignment-counts-names',
        help="The text to use for "
        "the kept fields in the plot. The order of the names must match that "
        "of --alignment-counts-order.",
        nargs='+',
        default=default_alignment_counts_names)

    parser.add_argument(
        '--without-rrna',
        help="If this flag is present, then "
        "a default set of fields excluding the reads mapping to ribosomal "
        "sequences will be used.",
        action='store_true')

    parser.add_argument('--config',
                        help="""The config file, if using 
        pretty names, "riboseq_sample_name_map" must be defined""",
                        type=str,
                        default=None)

    parser.add_argument('--title', help="The title of the plot", default=None)

    parser.add_argument('--fontsize',
                        help="The font size to use for most of "
                        "the text in the plot",
                        type=int,
                        default=default_fontsize)

    parser.add_argument('--legend-fontsize',
                        help="The font size to use for "
                        "the legend labels",
                        type=int,
                        default=default_legend_fontsize)

    parser.add_argument('--ymax',
                        help="The maximum for the y-axis",
                        type=int,
                        default=default_ymax)
    parser.add_argument('--ystep',
                        help="The step size for ticks on the "
                        "y-axis.",
                        type=int,
                        default=default_ystep)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    if args.without_rrna:
        msg = "Using the default without rrna field order"
        logger.info(msg)

        args.alignment_counts_order = without_rrna_order
        args.alignment_counts_names = without_rrna_names

    # we really need the fields from most- to least-restrictive
    args.alignment_counts_order = args.alignment_counts_order[::-1]
    args.alignment_counts_names = args.alignment_counts_names[::-1]

    msg = "Reading counts"
    logger.info(msg)

    alignment_counts = pd.read_csv(args.alignment_counts)
    alignment_counts = alignment_counts.sort_values('note')

    msg = "Calculating the diff counts"
    logger.info(msg)
    alignment_diff_counts = mpl_utils.get_diff_counts(
        alignment_counts[args.alignment_counts_order])

    df = pd.DataFrame(alignment_diff_counts)
    df.columns = args.alignment_counts_names

    names = alignment_counts['note'].reset_index(drop=True)
    df['name'] = names

    if args.config:
        try:
            config = yaml.load(open(args.config), Loader=yaml.FullLoader)
            sample_name_map = ribo_utils.get_sample_name_map(config)
            df['display_name'] = df['name'].apply(lambda x: sample_name_map[x])
        except Exception:
            msg = 'Fall back to "name", cannot fetch "display_name" from config file.'
            logger.warning(msg)
            df['display_name'] = df['name']
    else:
        df['display_name'] = df['name']

    msg = "Creating the stacked bar chart"
    logger.info(msg)

    fig, ax = plt.subplots()

    pal = sns.palettes.color_palette(palette="Set3",
                                     n_colors=len(args.alignment_counts_names))

    gap = 0.15

    # if we aren't given information about the y-axis, try to guess
    if args.ymax is None:
        field = args.alignment_counts_order[-2]
        max_count = alignment_counts[field].max()

        if args.ystep > max_count:
            args.ystep = np.ceil(max_count / 4)

        args.ymax = (np.ceil(max_count / args.ystep) * args.ystep) + 1
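        # illustrative example (hypothetical counts): with max_count = 3.2e7 and
        # ystep = 1e7, ymax = ceil(3.2e7 / 1e7) * 1e7 + 1 = 4e7 + 1, so the ticks
        # below become 0, 1e7, 2e7, 3e7, 4e7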

    yticks = np.arange(0, args.ymax, args.ystep)

    bars = mpl_utils.create_stacked_bar_graph(
        ax,
        alignment_diff_counts,
        colors=pal,
        x_tick_labels=df['display_name'],
        y_ticks=yticks,
        y_tick_labels=yticks,
        gap=gap,
        end_gaps=True,
        stack_labels=args.alignment_counts_names,
        y_title='Reads',
        log=False,
        font_size=args.fontsize,
        edge_colors='0.5')

    ax.legend(loc='upper center',
              bbox_to_anchor=(0.5, -0.6),
              ncol=3,
              fontsize=args.legend_fontsize,
              title="Filter",
              frameon=True)

    ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.0e'))
    mpl_utils.set_label_fontsize(ax, args.fontsize)
    mpl_utils.set_legend_title_fontsize(ax, args.fontsize)

    if args.title is not None:
        ax.set_title(args.title, fontsize=args.fontsize)

    msg = "Writing the plot to disk"
    logger.info(msg)
    fig.savefig(args.out, bbox_inches='tight')
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script uses the mygene.info service to find annotations "
        "for the transcripts associated with the ORFs in the given bed file. In "
        "particular, it extracts information from Swiss-Prot, TrEMBL, Interpro, "
        "PDB, Pfam, PROSITE, the Gene Ontology, and KEGG.")

    parser.add_argument('bed', help="The bed file")
    parser.add_argument('out', help="The output file. Its type will be inferred "
        "from its extension.")

    parser.add_argument('--do-not-trim', help="By default, the script will "
        "attempt to trim transcript identifiers such that they are valid Ensembl "
        "identifiers. If this flag is given, no trimming will take place.",
        action='store_true')

    parser.add_argument('--scopes', help="A list of scopes to use when querying "
        "mygene.info. Please see the documentation for more information about "
        "valid scopes: http://mygene.info/doc/query_service.html#available_fields",
        nargs='*', default=default_scopes)

    parser.add_argument('--do-not-convert-ids', help="By default, the script will "
        "treat the identifiers in the file as transcript identifiers. It first "
        "maps those to gene identifiers, and then it uses those to find the "
        "gene annotations. If the identifiers are already gene ids (or whatever "
        "is specified by scopes), then the first mapping is not necessary and "
        "can be skipped using this flag.", action='store_true')
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    convert_ids = not args.do_not_convert_ids

    msg = "Reading the bed file"
    logger.info(msg)
    bed = bed_utils.read_bed(args.bed)
    bed = bed[fields_to_keep]

    msg = "Extracting transcript ids"
    logger.info(msg)
    trim = not args.do_not_trim
    orf_ids = parallel.apply_iter_simple(bed['id'], parse_orf_id, trim)
    orf_ids_df = pd.DataFrame(orf_ids)

    if convert_ids:
        msg = "Querying transcript to gene id mapping"
        logger.info(msg)
        gene_ids = mygene_utils.get_transcript_to_gene_mapping(orf_ids_df['transcript_id'])
    else:
        gene_ids = pd.DataFrame()
        gene_ids['transcript_id'] = orf_ids_df['transcript_id']
        gene_ids['gene_id'] = orf_ids_df['transcript_id']

    msg = "Querying gene annotations"
    logger.info(msg)
    res_df = mygene_utils.query_mygene(gene_ids['gene_id'])

    msg = "Combining gene annotations with transcript ids"
    logger.info(msg)
    res_df = gene_ids.merge(res_df, on='gene_id', how='inner')

    msg = "Combining transcript annotations with ORF ids"
    logger.info(msg)
    orf_ids_fields = ['transcript_id', 'orf_id']
    res_df = orf_ids_df[orf_ids_fields].merge(res_df, on='transcript_id', how='inner')

    msg = "Combining ORF annotations with ORF predictions"
    logger.info(msg)
    res_df = bed.merge(res_df, left_on='id', right_on='orf_id', how='left')

    msg = "Writing ORF annotations to disk"
    logger.info(msg)
    pandas_utils.write_df(res_df, args.out, index=False)
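
# A rough sketch of querying mygene.info directly with the `mygene` package,
# roughly the kind of call that mygene_utils.query_mygene presumably wraps.
# The gene identifiers and requested fields below are illustrative assumptions,
# not values taken from this pipeline.
import mygene

mg = mygene.MyGeneInfo()

gene_ids = ["ENSG00000139618", "ENSG00000157764"]  # hypothetical Ensembl gene ids

annotations = mg.querymany(
    gene_ids,
    scopes="ensembl.gene",                        # interpret the inputs as Ensembl gene ids
    fields="symbol,name,interpro,pathway.kegg",   # a subset of the sources listed above
    as_dataframe=True)

print(annotations.head())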
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""This script runs the Rp-Bp pipelines 
        on a given sample. It requires a YAML config file that includes a number of keys. 
        Please see the documentation for a complete description.""")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")

    parser.add_argument('config', help="The (yaml) configuration file")

    parser.add_argument(
        'name', help="The name for the dataset, used in the created files")

    parser.add_argument('--tmp', help="The temp directory", default=None)

    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files "
                        "will be overwritten.",
                        action='store_true')

    parser.add_argument('--profiles-only',
                        help="""If this flag is present, then only 
        the ORF profiles will be created""",
                        action='store_true')

    parser.add_argument('-k',
                        '--keep-intermediate-files',
                        help="""If this flag is given,
        then all intermediate files will be kept; otherwise, they will be
        deleted. This feature is implemented piecemeal. If the --do-not-call flag
        is given, then nothing will be deleted.""",
                        action='store_true')

    slurm.add_sbatch_options(parser,
                             num_cpus=default_num_cpus,
                             mem=default_mem)
    logging_utils.add_logging_options(parser)
    pgrm_utils.add_star_options(parser, star_executable)
    pgrm_utils.add_flexbar_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # check that all of the necessary programs are callable
    programs = [
        'flexbar', args.star_executable, 'samtools', 'bowtie2',
        'create-base-genome-profile', 'remove-multimapping-reads',
        'extract-metagene-profiles', 'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets', 'extract-orf-profiles',
        'estimate-orf-bayes-factors', 'select-final-prediction-set',
        'create-orf-profiles', 'predict-translated-orfs'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data', 'ribosomal_index', 'star_index', 'genome_base_path',
        'genome_name', 'fasta', 'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    # if using slurm, submit the script, but we cannot use sys.argv directly
    # as the shell strips the quotes around the arguments
    if args.use_slurm:
        cmd = "{}".format(' '.join("'" + s + "'" if '"' in s else s
                                   for s in sys.argv))
        slurm.check_sbatch(cmd, args=args)
        return

    # handle all option strings to call programs
    logging_str = logging_utils.get_logging_options_string(args)
    star_str = pgrm_utils.get_star_options_string(args)
    flexbar_str = pgrm_utils.get_flexbar_options_string(args)

    # handle do_not_call so that we do call the preprocessing script,
    # but that it does not run anything
    call = not args.do_not_call
    do_not_call_str = ""
    if not call:
        do_not_call_str = "--do-not-call"

    overwrite_str = ""
    if args.overwrite:
        overwrite_str = "--overwrite"

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(shlex.quote(args.tmp))

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    cmd = ("create-orf-profiles {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}"
           .format(args.raw_data, args.config, args.name, args.num_cpus,
                   mem_str, do_not_call_str, overwrite_str,
                   keep_intermediate_str, logging_str, tmp_str, star_str,
                   flexbar_str))

    shell_utils.check_call(cmd)

    # check if we only want to create the profiles
    if args.profiles_only:
        return

    # then we predict the ORFs
    cmd = ("predict-translated-orfs {} {} --num-cpus {} {} {} {}".format(
        args.config, args.name, args.num_cpus, do_not_call_str, overwrite_str,
        logging_str))
    shell_utils.check_call(cmd)
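
# When this script resubmits itself through slurm, the shell has already
# stripped quoting from sys.argv, which is why the code above re-quotes
# arguments containing double quotes before calling sbatch. A small sketch of
# a more general alternative using the standard library's shlex.quote:
import shlex

def requote_command(argv):
    """Rebuild a shell-safe command line from an argv-style list."""
    return ' '.join(shlex.quote(arg) for arg in argv)

# hypothetical example with an argument containing a space
print(requote_command(["run-all-rpbp-instances", "my config.yaml", "--overwrite"]))
# run-all-rpbp-instances 'my config.yaml' --overwrite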
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script removes all of the peptides which match to multiple "
        "ORFs from the results found with get-all-orf-peptide-matches.")

    parser.add_argument('peptide_matches', help="The peptide matches file produced "
        "by get-all-orf-peptide-matches")
    parser.add_argument('out', help="A similar peptide matches file which "
        "contains only peptides which match to a unique ORF")

    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use",
        type=int, default=default_num_cpus)
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading peptide matches"
    logger.info(msg)

    peptide_matches = pd.read_csv(args.peptide_matches)

    msg = "Splitting the grouped matches into individual peptide matches"
    logger.info(msg)

    matches = parallel.apply_parallel(peptide_matches,
                                      args.num_cpus,
                                      parse_matches,
                                      progress_bar=True)

    msg = "Removing peptides which match to multiple ORFs"
    logger.info(msg)

    matches = utils.remove_nones(matches)
    matches = utils.flatten_lists(matches)
    matches_df = pd.DataFrame(matches)
    unique_matches_df = matches_df.drop_duplicates(subset='peptide', keep=False)

    msg = "Merging the ORF-peptide matches back to single records"
    logger.info(msg)

    unique_groups = unique_matches_df.groupby('orf_id')
    merged_unique_groups = parallel.apply_parallel_groups(unique_groups,
                                                          args.num_cpus,
                                                          merge_group,
                                                          progress_bar=True)

    merged_unique_df = pd.DataFrame(merged_unique_groups)

    msg = "Re-adding the ORFs which no longer have peptide matches"
    logger.info(msg)

    m_still_has_match = peptide_matches['orf_id'].isin(merged_unique_df['orf_id'])
    peptide_matches.loc[~m_still_has_match, 'num_matches'] = 0
    peptide_matches.loc[~m_still_has_match, 'peptide_matches'] = 0

    peps = [merged_unique_df, peptide_matches[~m_still_has_match]]
    merged_unique_df = pd.concat(peps)

    msg = "Writing the ORFs with unique matches to disk"
    logger.info(msg)

    pandas_utils.write_df(merged_unique_df, args.out, index=False)
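
# The key step above is drop_duplicates(subset='peptide', keep=False), which
# removes *every* row whose peptide occurs more than once, i.e. peptides that
# match multiple ORFs. A tiny illustration with made-up peptide/ORF pairs:
import pandas as pd

matches_df = pd.DataFrame({
    'orf_id':  ['orf_1', 'orf_2', 'orf_2', 'orf_3'],
    'peptide': ['PEPA',  'PEPA',  'PEPB',  'PEPC'],
})

# 'PEPA' matches both orf_1 and orf_2, so both of its rows are dropped
unique_matches_df = matches_df.drop_duplicates(subset='peptide', keep=False)
print(unique_matches_df)
#   orf_id peptide
# 2  orf_2    PEPB
# 3  orf_3    PEPC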
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script extracts the differential micropeptides from two "
        "conditions. Please see the documentation in redmine for more details.\n\n"
        "Please see the pyensembl (https://github.com/hammerlab/pyensembl) "
        "documentation for more information about the ensembl release and species."
    )

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name_a', help="The name of the first condition")
    parser.add_argument('name_b', help="The name of the second condition")
    parser.add_argument('out', help="The output (.csv.gz or .xlsx) file")

    parser.add_argument(
        '-a',
        '--append-sheet',
        help="If this flag is given, "
        "then a worksheet with the name '<name_a>,<name_b>' will be appended "
        "to the .xlsx file given by out (if it exists)",
        action='store_true')

    parser.add_argument(
        '-f',
        '--filter',
        help="If this flag is present, then "
        "the output will be filtered to include only the differential "
        "micropeptides with the highest KL-divergence and read coverage",
        action='store_true')

    parser.add_argument(
        '--read-filter-percent',
        help="If the the --filter flag "
        "is given, then only the top --read-filter-percent micropeptides will "
        "be considered for the final output. They still must meet the KL-"
        "divergence filtering criteria.",
        type=float,
        default=default_read_filter_percent)

    parser.add_argument(
        '--kl-filter-percent',
        help="If the the --filter flag "
        "is given, then only the top --read-kl-percent micropeptides will "
        "be considered for the final output. They still must meet the read "
        "coverage filtering criteria.",
        type=float,
        default=default_kl_filter_percent)

    parser.add_argument(
        '--id-matches',
        help="This is a list of files which "
        "contain ORF identifiers to compare to the differential micropeptides. "
        "For each of the files given, two columns will be added to the output "
        "which indicate if either A or B appear in the respective file. Each "
        "file should have a single ORF identifier on each line and contain "
        "nothing else.",
        nargs='*',
        default=default_id_matches)

    parser.add_argument(
        '--id-match-names',
        help="A name to include in the "
        "output file for each --id-matches file. The number of names must "
        "match the number of files.",
        nargs='*',
        default=default_id_match_names)

    parser.add_argument(
        '--overlaps',
        help="This is a list of bed12+ files "
        "which will be compared to the differential micropeptides. Two columns "
        "(one for A, one for B) will be added to the output which indicate if "
        "the respective micropeptides overlap a feature in each file by at "
        "least 1 bp.",
        nargs='*',
        default=default_overlaps)

    parser.add_argument(
        '--overlap-names',
        help="A name to include in the "
        "output file for each --overlaps file. The number of names must match "
        "the number of files.",
        nargs='*',
        default=default_overlap_names)

    parser.add_argument(
        '-r',
        '--ensembl-release',
        help="The version of Ensembl "
        "to use when mapping transcript identifiers to gene identifiers",
        type=int,
        default=default_ensembl_release)

    parser.add_argument(
        '-s',
        '--ensembl-species',
        help="The Ensembl species "
        "to use when mapping transcript identifiers to gene identifiers",
        default=default_ensembl_species)

    parser.add_argument(
        '--a-is-single-sample',
        help="By default, this script "
        "assumes the predictions come from merged replicates. If name_a is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.",
        action='store_true')

    parser.add_argument(
        '--b-is-single-sample',
        help="By default, this script "
        "assumes the predictions come from merged replicates. If name_b is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.",
        action='store_true')

    parser.add_argument('--fields-to-keep',
                        help="The fields to keep from the "
                        "Bayes factor file for each condition",
                        nargs='*',
                        default=default_fields_to_keep)

    parser.add_argument('--max-micropeptide-len',
                        help="The maximum (inclusive) "
                        "length of ORFs considered as micropeptides",
                        type=int,
                        default=default_max_micropeptide_len)

    parser.add_argument(
        '--do-not-fix-tcons',
        help="By default, the \"TCONS_\" "
        "identifiers from StringTie, etc., do not parse correctly; this script "
        "update the identifiers so that will parse correctly unless instructed not "
        "to. The script is likely to crash if the identifiers are not fixed.",
        action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ensembl database"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release,
                                       species=args.ensembl_species)
    ensembl.db  # touch the database to ensure it has been downloaded and indexed

    msg = "Checking the id-match and overlaps files"
    logger.info(msg)

    if len(args.id_matches) != len(args.id_match_names):
        msg = ("The number of --id-matches files and --id-match-names do not "
               "match. {} files and {} names".format(len(args.id_matches),
                                                     len(args.id_match_names)))

        raise ValueError(msg)

    if len(args.overlaps) != len(args.overlap_names):
        msg = ("The number of --overlaps files and --overlaps-names do not "
               "match. {} files and {} names".format(len(args.overlaps),
                                                     len(args.overlap_names)))

        raise ValueError(msg)

    utils.check_files_exist(args.id_matches)
    utils.check_files_exist(args.overlaps)

    if args.filter:
        msg = "Validating filter percentages"
        logger.info(msg)

        math_utils.check_range(args.read_filter_percent,
                               0,
                               1,
                               variable_name="--read-filter-percent")

        math_utils.check_range(args.kl_filter_percent,
                               0,
                               1,
                               variable_name="--kl-filter-percent")

    msg = "Extracting file names"
    logger.info(msg)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    note_str = config.get('note', None)

    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    # and the smoothing parameters
    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations',
                                        None)

    lengths_a = None
    offsets_a = None

    if args.a_is_single_sample:
        lengths_a, offsets_a = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name_a, is_unique=is_unique)

    bayes_factors_a = filenames.get_riboseq_bayes_factors(
        config['riboseq_data'],
        args.name_a,
        length=lengths_a,
        offset=offsets_a,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors_a):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".
               format(args.name_a, bayes_factors_a))
        raise FileNotFoundError(msg)

    predicted_orfs_a = filenames.get_riboseq_predicted_orfs(
        config['riboseq_data'],
        args.name_a,
        length=lengths_a,
        offset=offsets_a,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations,
        is_filtered=True,
        is_chisq=False)

    if not os.path.exists(predicted_orfs_a):
        msg = (
            "Could not find the predictions bed file for {}. ({}). Quitting.".
            format(args.name_a, predicted_orfs_a))
        raise FileNotFoundError(msg)

    lengths_b = None
    offsets_b = None
    if args.b_is_single_sample:
        lengths_b, offsets_b = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name_b, is_unique=is_unique)

    bayes_factors_b = filenames.get_riboseq_bayes_factors(
        config['riboseq_data'],
        args.name_b,
        length=lengths_b,
        offset=offsets_b,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors_b):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".
               format(args.name_b, bayes_factors_b))
        raise FileNotFoundError(msg)

    predicted_orfs_b = filenames.get_riboseq_predicted_orfs(
        config['riboseq_data'],
        args.name_b,
        length=lengths_b,
        offset=offsets_b,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations,
        is_filtered=True,
        is_chisq=False)

    if not os.path.exists(predicted_orfs_b):
        msg = (
            "Could not find the predictions bed file for {}. ({}). Quitting.".
            format(args.name_b, predicted_orfs_b))
        raise FileNotFoundError(msg)

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    if not os.path.exists(exons_file):
        msg = "Could not find the exons file ({}). Quitting.".format(
            exons_file)
        raise FileNotFoundError(msg)

    msg = "Reading the exons"
    logger.info(msg)

    exons = bed_utils.read_bed(exons_file)

    msg = "Reading the BF files"
    logger.info(msg)

    bf_df_a = bed_utils.read_bed(bayes_factors_a)
    bf_df_b = bed_utils.read_bed(bayes_factors_b)

    msg = "Reading the predictions files"
    logger.info(msg)

    bed_df_a = bed_utils.read_bed(predicted_orfs_a)
    bed_df_b = bed_utils.read_bed(predicted_orfs_b)

    differential_micropeptide_dfs = []

    # extract micropeptides
    msg = "Extracting micropeptides"
    logger.info(msg)

    m_micropeptides_a = bed_df_a['orf_len'] <= args.max_micropeptide_len
    m_micropeptides_b = bed_df_b['orf_len'] <= args.max_micropeptide_len

    micropeptides_a = bed_df_a[m_micropeptides_a]
    micropeptides_b = bed_df_b[m_micropeptides_b]

    long_orfs_a = bed_df_a[~m_micropeptides_a]
    long_orfs_b = bed_df_b[~m_micropeptides_b]

    msg = "Finding micropeptides in A with no overlap in B"
    logger.info(msg)

    micropeptides_a_no_match_b = bed_utils.subtract_bed(micropeptides_a,
                                                        bed_df_b,
                                                        exons=exons)

    micropeptides_a_no_match_b_df = pd.DataFrame()
    micropeptides_a_no_match_b_df['A'] = list(micropeptides_a_no_match_b)
    micropeptides_a_no_match_b_df['B'] = None
    micropeptides_a_no_match_b_df['kl'] = np.inf
    micropeptides_a_no_match_b_df['overlap_type'] = 'micro_a_only'

    differential_micropeptide_dfs.append(micropeptides_a_no_match_b_df)

    msg = "Finding micropeptides in B with no overlap in A"
    logger.info(msg)

    micropeptides_b_no_match_a = bed_utils.subtract_bed(micropeptides_b,
                                                        bed_df_a,
                                                        exons=exons)

    micropeptides_b_no_match_a_df = pd.DataFrame()
    micropeptides_b_no_match_a_df['B'] = list(micropeptides_b_no_match_a)
    micropeptides_b_no_match_a_df['A'] = None
    micropeptides_b_no_match_a_df['kl'] = np.inf
    micropeptides_b_no_match_a_df['overlap_type'] = 'micro_b_only'

    differential_micropeptide_dfs.append(micropeptides_b_no_match_a_df)

    msg = "Finding overlapping micropeptides"
    logger.info(msg)

    micropeptides_a_micropeptides_b_df = get_overlap_df(
        micropeptides_a, micropeptides_b, 'micro_a_micro_b', bf_df_a, bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_micropeptides_b_df)

    micropeptides_a_long_b_df = get_overlap_df(micropeptides_a, long_orfs_b,
                                               'micro_a_long_b', bf_df_a,
                                               bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_long_b_df)

    micropeptides_b_long_a_df = get_overlap_df(long_orfs_a, micropeptides_b,
                                               'long_a_micro_b', bf_df_a,
                                               bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_b_long_a_df)

    differential_micropeptides_df = pd.concat(differential_micropeptide_dfs)

    msg = "Adding read count information"
    logger.info(msg)

    res = differential_micropeptides_df.merge(bf_df_a[args.fields_to_keep],
                                              left_on='A',
                                              right_on='id',
                                              how='left')
    to_rename = {f: "{}_A".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_A', axis=1)

    res = res.merge(bf_df_b[args.fields_to_keep],
                    left_on='B',
                    right_on='id',
                    how='left')
    to_rename = {f: "{}_B".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_B', axis=1)

    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    if not args.do_not_fix_tcons:
        # replace TCONS_ with TCONS
        res['A'] = res['A'].str.replace("TCONS_", "TCONS")
        res['B'] = res['B'].str.replace("TCONS_", "TCONS")

    msg = "Extracting the genes and their biotypes using pyensembl"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release,
                                       species=args.ensembl_species)
    ensembl_transcript_ids = set(ensembl.transcript_ids())

    biotypes_a = parallel.apply_df_simple(res, get_transcript_and_biotype, 'A',
                                          ensembl, ensembl_transcript_ids)
    biotypes_b = parallel.apply_df_simple(res, get_transcript_and_biotype, 'B',
                                          ensembl, ensembl_transcript_ids)

    biotypes_a = utils.remove_nones(biotypes_a)
    biotypes_b = utils.remove_nones(biotypes_b)

    biotypes_a = pd.DataFrame(biotypes_a)
    biotypes_b = pd.DataFrame(biotypes_b)

    res = res.merge(biotypes_a, on='A', how='left')
    res = res.merge(biotypes_b, on='B', how='left')

    msg = "Pulling annotations from mygene.info"
    logger.info(msg)

    # pull annotations from mygene
    gene_info_a = mygene_utils.query_mygene(res['gene_id_A'])
    gene_info_b = mygene_utils.query_mygene(res['gene_id_B'])

    # and add the mygene info
    res = res.merge(gene_info_a,
                    left_on='gene_id_A',
                    right_on='gene_id',
                    how='left')

    to_rename = {f: "{}_A".format(f) for f in gene_info_a.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)

    res = res.merge(gene_info_b,
                    left_on='gene_id_B',
                    right_on='gene_id',
                    how='left')

    to_rename = {f: "{}_B".format(f) for f in gene_info_b.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)

    msg = "Removing duplicates"
    logger.info(msg)
    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    msg = "Adding --id-matches columns"
    logger.info(msg)

    for (id_match_file, name) in zip(args.id_matches, args.id_match_names):
        res = add_id_matches(res, id_match_file, name)

    msg = "Adding --overlaps columns"
    logger.info(msg)

    for (overlap_file, name) in zip(args.overlaps, args.overlap_names):
        res = add_overlaps(res, overlap_file, name, bed_df_a, bed_df_b, exons)

    msg = "Sorting by in-frame reads"
    logger.info(msg)

    res['x_1_sum_A'] = res['x_1_sum_A'].fillna(0)
    res['x_1_sum_B'] = res['x_1_sum_B'].fillna(0)
    res['x_1_sum'] = res['x_1_sum_A'] + res['x_1_sum_B']
    res = res.sort_values('x_1_sum', ascending=False)

    if args.filter:
        msg = "Filtering the micropeptides by read coverage and KL-divergence"
        logger.info(msg)

        x_1_sum_ranks = res['x_1_sum'].rank(method='min',
                                            na_option='top',
                                            ascending=False)
        num_x_1_sum_ranks = x_1_sum_ranks.max()
        max_good_x_1_sum_rank = num_x_1_sum_ranks * args.read_filter_percent
        m_good_x_1_sum_rank = x_1_sum_ranks <= max_good_x_1_sum_rank

        msg = ("Number of micropeptides passing read filter: {}".format(
            sum(m_good_x_1_sum_rank)))
        logger.debug(msg)

        kl_ranks = res['kl'].rank(method='dense',
                                  na_option='top',
                                  ascending=False)
        num_kl_ranks = kl_ranks.max()
        max_good_kl_rank = num_kl_ranks * args.kl_filter_percent
        m_good_kl_rank = kl_ranks <= max_good_kl_rank

        msg = ("Number of micropeptides passing KL filter: {}".format(
            sum(m_good_kl_rank)))
        logger.debug(msg)

        m_both_filters = m_good_x_1_sum_rank & m_good_kl_rank

        msg = ("Number of micropeptides passing both filters: {}".format(
            sum(m_both_filters)))
        logger.debug(msg)

        res = res[m_both_filters]

    msg = "Writing differential micropeptides to disk"
    logger.info(msg)

    if not args.append_sheet:
        pandas_utils.write_df(res, args.out, index=False)
    else:
        sheet_name = "{},{}".format(args.name_a, args.name_b)
        utils.append_to_xlsx(res, args.out, sheet=sheet_name, index=False)
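
# The --filter option above keeps only the top fraction of micropeptides by
# in-frame read count (and, analogously, by KL-divergence) using pandas ranks.
# A small sketch of the same rank-and-threshold logic on made-up values; the
# 0.5 cutoff is illustrative, not a pipeline default:
import pandas as pd

res = pd.DataFrame({'x_1_sum': [500, 120, 40, 10]})
read_filter_percent = 0.5  # keep the top 50% by in-frame reads

x_1_sum_ranks = res['x_1_sum'].rank(method='min', na_option='top', ascending=False)
max_good_x_1_sum_rank = x_1_sum_ranks.max() * read_filter_percent
m_good_x_1_sum_rank = x_1_sum_ranks <= max_good_x_1_sum_rank

print(res[m_good_x_1_sum_rank])  # the rows with x_1_sum 500 and 120 pass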
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='''Prepare a reference genome and matching 
        annotations, including labelled ORFs, for use with the Rp-Bp periodicity estimation 
        and ORF translation prediction pipeline.''')

    parser.add_argument('config', help='''The (yaml) configuration file''')

    parser.add_argument('--overwrite',
                        help='''If this flag is present, existing files
        will be overwritten.''',
                        action='store_true')

    slurm.add_sbatch_options(parser,
                             num_cpus=default_num_cpus,
                             mem=default_mem)
    logging_utils.add_logging_options(parser)
    pgrm_utils.add_star_options(parser, star_executable)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # check required callable programs, config keys and files
    programs = [
        'extract-orf-coordinates', 'label-orfs', 'bowtie2-build-s',
        'split-bed12-blocks', 'gtf-to-bed12', args.star_executable
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'genome_base_path', 'genome_name', 'gtf', 'fasta', 'ribosomal_fasta',
        'ribosomal_index', 'star_index'
    ]
    utils.check_keys_exist(config, required_keys)

    files = [config['gtf'], config['fasta'], config['ribosomal_fasta']]
    if 'de_novo_gtf' in config:
        files += [config['de_novo_gtf']]
    utils.check_files_exist(files, source='prepare-rpbp-genome')

    # check if we want to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    call = not args.do_not_call

    # the rRNA index
    cmd = "bowtie2-build-s {} {}".format(config['ribosomal_fasta'],
                                         config['ribosomal_index'])

    in_files = [config['ribosomal_fasta']]
    out_files = pgrm_utils.get_bowtie2_index_files(config['ribosomal_index'])
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)

    # the STAR index
    mem = utils.human2bytes(args.mem)
    cmd = ("{} --runMode genomeGenerate --genomeDir {} --genomeFastaFiles {} "
           "--runThreadN {} --limitGenomeGenerateRAM {}".format(
               args.star_executable, config['star_index'], config['fasta'],
               args.num_cpus, mem))

    in_files = [config['fasta']]
    out_files = pgrm_utils.get_star_index_files(config['star_index'])
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)

    # get the ORFs
    get_orfs(config['gtf'], args, config, is_annotated=True, is_de_novo=False)

    # we will use these files later in the pipeline
    annotated_orfs = filenames.get_orfs(config['genome_base_path'],
                                        config['genome_name'],
                                        note=config.get('orf_note'),
                                        is_annotated=True,
                                        is_de_novo=False)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    annotated_exons_file = filenames.get_exons(config['genome_base_path'],
                                               config['genome_name'],
                                               note=config.get('orf_note'),
                                               is_annotated=True,
                                               is_de_novo=False)

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    annotated_labeled_orfs = filenames.get_labels(config['genome_base_path'],
                                                  config['genome_name'],
                                                  note=config.get('orf_note'),
                                                  is_annotated=True,
                                                  is_de_novo=False)

    labeled_orfs = filenames.get_labels(config['genome_base_path'],
                                        config['genome_name'],
                                        note=config.get('orf_note'))

    use_gff3_specs = config['gtf'].endswith('gff')
    gtf_file = filenames.get_gtf(config['genome_base_path'],
                                 config['genome_name'],
                                 is_gff3=use_gff3_specs,
                                 is_star_input=True)

    # now, check if we have a de novo assembly
    if 'de_novo_gtf' in config:
        get_orfs(config['de_novo_gtf'],
                 args,
                 config,
                 is_annotated=False,
                 is_de_novo=True)

        # we need to concat the ORF and exon files
        de_novo_orfs = filenames.get_orfs(config['genome_base_path'],
                                          config['genome_name'],
                                          note=config.get('orf_note'),
                                          is_annotated=False,
                                          is_de_novo=True)

        orfs_files = [annotated_orfs, de_novo_orfs]

        orfs_files_str = ' '.join(orfs_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            orfs_genomic, orfs_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(orfs_files, sort_bed=True)
            concatenated_bed['orf_num'] = range(len(concatenated_bed))
            additional_columns = ['orf_num', 'orf_len', 'orf_type']
            fields = bed_utils.bed12_field_names + additional_columns
            bed_utils.write_bed(concatenated_bed[fields], orfs_genomic)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

        de_novo_exons_file = filenames.get_exons(config['genome_base_path'],
                                                 config['genome_name'],
                                                 note=config.get('orf_note'),
                                                 is_annotated=False,
                                                 is_de_novo=True)

        exons_files = [annotated_exons_file, de_novo_exons_file]

        exons_files_str = ' '.join(exons_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            exons_file, exons_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(exons_files,
                                                     sort_bed=True)
            fields = bed_utils.bed6_field_names + [
                'exon_index', 'transcript_start'
            ]
            bed_utils.write_bed(concatenated_bed[fields], exons_file)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

        de_novo_labeled_orfs = filenames.get_labels(
            config['genome_base_path'],
            config['genome_name'],
            note=config.get('orf_note'),
            is_annotated=False,
            is_de_novo=True)

        label_files = [annotated_labeled_orfs, de_novo_labeled_orfs]

        label_files_str = ' '.join(label_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            labeled_orfs, label_files_str))
        logger.info(msg)

        if call:
            # not sorted, as is
            concatenated_bed = bed_utils.concatenate(label_files,
                                                     sort_bed=False)
            bed_utils.write_bed(concatenated_bed, labeled_orfs)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

        # we also need to concat the annotations to inform STAR
        # there is no particular reason to merge and sort the files, so
        # we just concatenate them...
        if (config['de_novo_gtf'].endswith('gff') == use_gff3_specs):
            cmd = ("awk '!/^#/' {} {} > {}".format(config['gtf'],
                                                   config['de_novo_gtf'],
                                                   gtf_file))
            in_files = [config['gtf'], config['de_novo_gtf']]
            out_files = [gtf_file]
            shell_utils.call_if_not_exists(cmd,
                                           out_files,
                                           in_files=in_files,
                                           overwrite=args.overwrite,
                                           call=call)
        else:
            msg = (
                "Skipping concatenation due to mismatch in format specifications (GTF2/GFF3)"
                "for reference and do novo annotations. Symlink to reference annotations created."
            )
            logger.warning(msg)
            if os.path.exists(config['gtf']):
                shell_utils.create_symlink(config['gtf'], gtf_file, call)

    else:
        # if we do not have a de novo assembly, symlink the files

        if os.path.exists(annotated_orfs):
            shell_utils.create_symlink(annotated_orfs, orfs_genomic, call)

        if os.path.exists(annotated_exons_file):
            shell_utils.create_symlink(annotated_exons_file, exons_file, call)

        if os.path.exists(annotated_labeled_orfs):
            shell_utils.create_symlink(annotated_labeled_orfs, labeled_orfs,
                                       call)

        if os.path.exists(config['gtf']):
            shell_utils.create_symlink(config['gtf'], gtf_file, call)
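
# A hedged sketch of what concatenating and sorting BED-like files with pandas
# might look like, in the spirit of the bed_utils.concatenate calls above. The
# tab-separated-with-header assumption and the 'seqname'/'start' sort keys are
# illustrative; this is not the actual bed_utils implementation.
import pandas as pd

def concatenate_bed(paths, sort_bed=True):
    """Stack several BED-like files and optionally sort them by position."""
    frames = [pd.read_csv(p, sep='\t') for p in paths]
    bed = pd.concat(frames, ignore_index=True)
    if sort_bed:
        bed = bed.sort_values(['seqname', 'start']).reset_index(drop=True)
    return bed

# hypothetical usage mirroring the annotated and de novo ORF files above
# combined_orfs = concatenate_bed(['annotated.orfs.bed', 'de-novo.orfs.bed'])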
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script creates a simple latex document containing the read "
        "filtering images, metagene profiles and analysis, and standard section text."
    )
    parser.add_argument('config',
                        help="The (yaml) config file for the project")
    parser.add_argument('out', help="The path for the output files")

    parser.add_argument(
        '--show-orf-periodicity',
        help="If this flag is "
        "present, bar charts showing the periodicity of each ORF type will be "
        "included in the report.",
        action='store_true')

    parser.add_argument(
        '--show-read-length-bfs',
        help="If this flag is given, "
        "plots showing the Bayes factor at each offset for each read length "
        "are included in the report.",
        action='store_true')

    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files will "
                        "be overwritten.",
                        action='store_true')

    parser.add_argument('--min-visualization-count',
                        help="Read lengths with fewer than this "
                        "number of reads will not be included in the report.",
                        type=int,
                        default=metagene_options['min_metagene_image_count'])

    parser.add_argument('--image-type',
                        help="The type of image types to create. This "
                        "must be an extension which matplotlib can interpret.",
                        default=default_image_type)

    parser.add_argument(
        '-c',
        '--create-fastqc-reports',
        help="If this flag is given, then "
        "fastqc reports will be created for most fastq and bam files. By default, they are "
        "not created.",
        action='store_true')

    parser.add_argument('--tmp',
                        help="If the fastqc reports are created, "
                        "they will use this location for temp files",
                        default=None)

    parser.add_argument(
        '--note',
        help="If this option is given, it will be used in the "
        "filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=None)

    slurm.add_sbatch_options(parser, num_cpus=default_num_cpus)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    if args.note is not None:
        config['note'] = args.note
    note = config.get('note', None)

    sample_names = sorted(config['riboseq_samples'].keys())

    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    programs = [
        'create-read-length-metagene-profile-plot',
        'visualize-metagene-profile-bayes-factor',
        'get-all-read-filtering-counts', 'samtools',
        'visualize-read-filtering-counts', 'get-read-length-distribution',
        'plot-read-length-distribution'
    ]

    if args.create_fastqc_reports:
        programs.extend(['fastqc', 'java'])

    shell_utils.check_programs_exist(programs)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)

    # first, create the read filtering information...
    create_read_filtering_plots(args.config, config, args)
    # ... and all the other figures.
    for name in sample_names:
        periodic_offsets = filenames.get_periodic_offsets(
            config['riboseq_data'], name, is_unique=is_unique, note=note)
        offsets_df = pd.read_csv(periodic_offsets)
        create_figures(args.config, config, name, offsets_df, args)

    min_metagene_profile_count = config.get(
        'min_metagene_profile_count',
        metagene_options['min_metagene_profile_count'])

    min_bf_mean = config.get('min_metagene_bf_mean',
                             metagene_options['min_metagene_bf_mean'])

    max_bf_var = config.get('max_metagene_bf_var',
                            metagene_options['max_metagene_bf_var'])

    min_bf_likelihood = config.get(
        'min_metagene_bf_likelihood',
        metagene_options['min_metagene_bf_likelihood'])

    project_name = config.get("project_name", default_project_name)
    title = "Preprocessing results for {}".format(project_name)

    tex_file = os.path.join(args.out, "preprocessing-report.tex")
    with open(tex_file, 'w') as out:

        latex.begin_document(out, title, abstract, commands=commands)

        latex.section(out, "Introduction")

        latex.clearpage(out)
        latex.newpage(out)

        latex.section(out, "Mapping and filtering")
        latex.write(out, mapping_and_filtering_text)

        # the read filtering figures
        read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=note, image_type=args.image_type)

        n = "no-rrna-{}".format(note)
        no_rrna_read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=n, image_type=args.image_type)

        latex.begin_figure(out)
        latex.write_graphics(out, read_filtering_image, width=0.45)
        latex.write_graphics(out, no_rrna_read_filtering_image, width=0.45)
        latex.write_caption(out,
                            read_filtering_caption,
                            label=read_filtering_label)
        latex.end_figure(out)

        latex.clearpage(out)

        # the read length distributions
        latex.section(out,
                      "Read length distributions",
                      label=length_distribution_section_label)

        msg = "Writing length distribution figures"
        logger.info(msg)

        latex.begin_table(out, "cc")

        latex.write_header(out,
                           ["All aligned reads", "Uniquely-aligning reads"])

        for name in sample_names:
            data = config['riboseq_samples'][name]
            read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'],
                name,
                is_unique=False,
                note=note,
                image_type=args.image_type)

            unique_read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'],
                name,
                is_unique=True,
                note=note,
                image_type=args.image_type)

            msg = "Looking for image file: {}".format(
                read_length_distribution_image)
            logger.debug(msg)

            if os.path.exists(read_length_distribution_image):
                latex.write_graphics(out,
                                     read_length_distribution_image,
                                     width=0.45)
            else:
                msg = "Could not find image: {}".format(
                    read_length_distribution_image)
                logger.warning(msg)

                text = "Missing: {}\n\n".format(name)
                latex.write(out, text)

            latex.write_column_sep(out)

            msg = "Looking for image file: {}".format(
                unique_read_length_distribution_image)
            logger.debug(msg)

            if os.path.exists(unique_read_length_distribution_image):
                latex.write_graphics(out,
                                     unique_read_length_distribution_image,
                                     width=0.45)
            else:
                msg = "Could not find image: {}".format(
                    unique_read_length_distribution_image)
                logger.warning(msg)

                text = "Missing: {}\n\n".format(name)
                latex.write(out, text)

            latex.write_row_sep(out)

        latex.end_table(out)
        latex.clearpage(out)

        latex.section(out, "Read length periodicity", label=periodicity_label)

        for name in sample_names:
            i = 0

            data = config['riboseq_samples'][name]

            msg = "Processing sample: {}".format(name)
            logger.info(msg)

            logger.debug("overwrite: {}".format(args.overwrite))

            periodic_offsets = filenames.get_periodic_offsets(
                config['riboseq_data'], name, is_unique=is_unique, note=note)
            offsets_df = pd.read_csv(periodic_offsets)

            min_read_length = int(offsets_df['length'].min())
            max_read_length = int(offsets_df['length'].max())

            latex.begin_table(out, "YY")

            header = "\\multicolumn{2}{c}{" + name + "}"
            header = [header]
            latex.write_header(out, header)

            for length in range(min_read_length, max_read_length + 1):
                msg = "Processing length: {}".format(length)
                logger.info(msg)

                # check which offset is used

                # select the row for this length
                mask_length = offsets_df['length'] == length

                # TODO: this is sometimes length 0. why?
                if sum(mask_length) == 0:
                    continue

                length_row = offsets_df[mask_length].iloc[0]

                # now, check all of the filters
                offset = int(length_row['highest_peak_offset'])
                offset_status = "Used for analysis"

                if max_bf_var is not None:
                    if ((length_row['highest_peak_bf_mean'] <= min_bf_mean) or
                            length_row['highest_peak_bf_var'] >= max_bf_var):
                        offset_status = "BF mean too small or BF var too high"

                if min_bf_likelihood is not None:
                    likelihood = 1 - scipy.stats.norm.cdf(
                        min_bf_mean, length_row['highest_peak_bf_mean'],
                        np.sqrt(length_row['highest_peak_bf_var']))
                    if likelihood <= min_bf_likelihood:
                        offset_status = "Likehood too small"

                if (max_bf_var is None) and (min_bf_likelihood is None):
                    if length_row['highest_peak_bf_mean'] <= min_bf_mean:
                        offset_status = "BF mean too small"

                if length_row[
                        'highest_peak_profile_sum'] < min_metagene_profile_count:
                    offset_status = "Count too small"

                if length_row[
                        'highest_peak_profile_sum'] < args.min_visualization_count:
                    msg = "Not enough reads of this length. Skipping."
                    logger.warning(msg)
                    continue

                metagene_profile_image = filenames.get_metagene_profile_image(
                    config['riboseq_data'],
                    name,
                    image_type=args.image_type,
                    is_unique=is_unique,
                    length=length,
                    note=note)

                #title = ("length: {}. P-site offset: {}. \\newline status: {}"
                #"\n".format(length, offset, offset_status))
                #latex.write(out, title, size="scriptsize")
                title = ("Length: {}. P-site offset: {}. Status: {}\n".format(
                    length, offset, offset_status))
                if args.show_read_length_bfs:
                    title = "\scriptsize{" + title + "}"
                    title = "\\multicolumn{2}{c}{" + title + "}"
                    latex.write(out, title)
                    latex.write_row_sep(out)
                else:
                    latex.write(out, title, size="scriptsize")

                latex.write_graphics(out, metagene_profile_image, width=0.45)

                i += 1
                if i % 2 == 1:
                    latex.write_column_sep(out)
                else:
                    latex.write_row_sep(out)

                if args.show_read_length_bfs:

                    bayes_factor_image = filenames.get_metagene_profile_bayes_factor_image(
                        config['riboseq_data'],
                        name,
                        image_type=args.image_type,
                        is_unique=is_unique,
                        length=length,
                        note=note)

                    #latex.centering(out)
                    latex.write_graphics(out, bayes_factor_image, width=0.45)

                    i += 1
                    if i % 2 == 1:
                        latex.write_column_sep(out)
                    else:
                        latex.write_row_sep(out)

            if i % 2 == 1:
                latex.write_row_sep(out)

            latex.end_table(out)
            latex.clearpage(out)

        ### ORF type metagene profiles
        if args.show_orf_periodicity:
            title = "ORF type periodicity"
            latex.section(out, title)

            strands = ['+', '-']
            for sample_name in sample_names:
                i = 0

                try:
                    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                        config,
                        sample_name,
                        is_unique=is_unique,
                        default_params=metagene_options)
                except FileNotFoundError:
                    msg = (
                        "Could not parse out lengths and offsets for sample: {}. "
                        "Skipping".format(sample_name))
                    logger.error(msg)
                    continue

                orf_type_profile_base = filenames.get_orf_type_profile_base(
                    config['riboseq_data'],
                    sample_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=note,
                    subfolder='orf-profiles')

                for orf_type in ribo_utils.orf_types:
                    for strand in strands:
                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base,
                            orf_type,
                            strand,
                            image_type=args.image_type)

                        msg = "Looking for image file: {}".format(
                            orf_type_profile)
                        logger.debug(msg)
                        if os.path.exists(orf_type_profile):
                            if i % 4 == 0:
                                latex.begin_figure(out)

                            i += 1
                            latex.write_graphics(out,
                                                 orf_type_profile,
                                                 height=0.23)

                            if i % 4 == 0:
                                latex.end_figure(out)
                                latex.clearpage(out)

                if (i > 0) and (i % 4 != 0):
                    latex.end_figure(out)
                    latex.clearpage(out)

        latex.end_document(out)

    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)

    if args.create_fastqc_reports:
        parallel.apply_parallel_iter(config['riboseq_samples'].items(),
                                     args.num_cpus, create_fastqc_reports,
                                     config, args)
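
# The read-length filtering loop above decides whether a length is usable by
# asking how likely the Bayes factor is to exceed min_bf_mean, given the
# estimated BF mean and variance for that length. A standalone sketch of that
# check with illustrative numbers:
import numpy as np
import scipy.stats

min_bf_mean = 5            # threshold on the Bayes factor mean
min_bf_likelihood = 0.5    # required probability of exceeding the threshold

bf_mean, bf_var = 7.2, 4.0  # hypothetical estimates for one read length

# P(BF > min_bf_mean) under a normal approximation with the estimated mean/variance
likelihood = 1 - scipy.stats.norm.cdf(min_bf_mean, bf_mean, np.sqrt(bf_var))

offset_status = ("Used for analysis" if likelihood > min_bf_likelihood
                 else "Likelihood too small")
print(round(likelihood, 3), offset_status)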
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script creates the plots which detail the basic characteristics "
        "of the ORF predictions from the Rp-Bp pipeline. It also creates and compiles (if "
        "possible) a latex report for them.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('out',
                        help="The base output directory for the latex report")

    parser.add_argument(
        '--show-unfiltered-orfs',
        help="If this flag is "
        "present, bar charts showing the distribution of the types of the "
        "unfiltered ORF set will be included",
        action='store_true')

    parser.add_argument(
        '--show-orf-periodicity',
        help="If this flag is "
        "present, bar charts showing the periodicity of each ORF type will be "
        "included in the report.",
        action='store_true')

    parser.add_argument('--uniprot',
                        help="The uniprot ORF lengths, if available",
                        default=default_uniprot)
    parser.add_argument('--uniprot-label',
                        help="The label to use for the uniprot ORFs in "
                        "the plot",
                        default=default_uniprot_label)

    parser.add_argument('--image-type',
                        help="The format of the image files. This must be "
                        "a format usable by matplotlib.",
                        default=default_image_type)

    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files will "
                        "be overwritten.",
                        action='store_true')

    parser.add_argument(
        '--note',
        help="If this option is given, it will be used in the "
        "filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=None)

    parser.add_argument(
        '--show-chisq',
        help="If this flag is given, then the "
        "results from Rp-chi will be included in the document; otherwise, they "
        "will not be created or shown.",
        action='store_true')

    parser.add_argument('-t',
                        '--tmp',
                        help="A location for temporary files",
                        default=None)

    slurm.add_sbatch_options(parser, num_cpus=default_num_cpus)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    programs = [
        'create-orf-length-distribution-line-graph',
        'create-orf-types-bar-chart', 'visualize-orf-type-metagene-profiles'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = ['riboseq_data', 'riboseq_samples']
    utils.check_keys_exist(config, required_keys)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # by default, we will not include chisq
    chisq_values = [False]
    if args.show_chisq:
        chisq_values = [True, False]

    filtered_values = [True]
    if args.show_unfiltered_orfs:
        filtered_values = [True, False]

    grouped_values = [True, False]

    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)

    # first, create all of the figures
    create_all_figures(config, args)

    note_str = config.get('note', None)
    out_note_str = note_str

    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note

    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations',
                                        None)

    project_name = config.get("project_name", default_project_name)
    title = "Rp-Bp prediction analysis for {}".format(project_name)
    abstract = "This document shows the results of the Rp-Bp pipeline analysis."

    #tex_file = os.path.join(args.out, "prediction-report.tex")
    tex_file = filenames.get_rpbp_prediction_report(args.out, out_note_str)

    with open(tex_file, 'w') as out:

        latex.begin_document(out, title, abstract)

        latex.write(out, "\n")

        latex.clearpage(out)

        ### ORF type distributions
        title = "Predicted ORF type distributions"
        latex.section(out, title)

        # first, handle all of the regular datasets
        sample_names = sorted(config['riboseq_samples'].keys())

        # and check if we also have replicates
        replicate_names = []
        if 'riboseq_biological_replicates' in config:
            replicate_names = sorted(
                ribo_utils.get_riboseq_replicates(config).keys())

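        # the metagene profile images are created separately for each strand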
        strands = ["+", "-"]

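        # i counts the images written so far; the bar charts are collected
        # into figures of at most six images each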
        i = 0
        for sample_name in sample_names:

            try:
                lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                    config,
                    sample_name,
                    is_unique=is_unique,
                    default_params=metagene_options)
            except FileNotFoundError:
                msg = (
                    "Could not parse out lengths and offsets for sample: {}. "
                    "Skipping".format(sample_name))
                logger.error(msg)
                continue

            caption = "ORF types: {}".format(sample_name)
            is_first = True

            # first, just dump all of the bar charts to the page
            it = itertools.product(grouped_values, chisq_values,
                                   filtered_values)

            for is_grouped, is_chisq, is_filtered in it:

                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                orf_types_bar_chart = filenames.get_orf_types_bar_chart(
                    config['riboseq_data'],
                    sample_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq,
                    is_filtered=is_filtered)

                msg = "Looking for image file: {}".format(orf_types_bar_chart)
                logger.debug(msg)

                if os.path.exists(orf_types_bar_chart):
                    # begin a new figure for the first image and after every
                    # sixth image, matching the end-of-figure condition below
                    if is_first or (i % 6 == 0):
                        latex.begin_figure(out)
                        is_first = False

                    i += 1
                    latex.write_graphics(out, orf_types_bar_chart, height=0.15)

                    if i % 6 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)

                else:
                    msg = "Could not find image: {}".format(
                        orf_types_bar_chart)
                    logger.warning(msg)

            if (i > 0) and (i % 6) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i % 6 != 0:
            latex.clearpage(out)

        # now, if the config file specifies replicates, create figures for those
        i = 0
        for replicate_name in replicate_names:
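            # the file names for merged replicates do not include specific
            # read lengths or offsets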
            lengths = None
            offsets = None

            caption = "ORF types: {}".format(replicate_name)

            it = itertools.product(grouped_values, chisq_values,
                                   filtered_values)

            is_first = True

            for is_grouped, is_chisq, is_filtered in it:

                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                orf_types_bar_chart = filenames.get_orf_types_bar_chart(
                    config['riboseq_data'],
                    replicate_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq,
                    is_filtered=is_filtered)

                msg = "Looking for image file: {}".format(orf_types_bar_chart)
                logger.debug(msg)

                if os.path.exists(orf_types_bar_chart):
                    # begin a new figure for the first image and after every
                    # sixth image, matching the end-of-figure condition below
                    if is_first or (i % 6 == 0):
                        latex.begin_figure(out)
                        is_first = False

                    i += 1
                    latex.write_graphics(out, orf_types_bar_chart, height=0.15)

                    if i % 6 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)

                else:
                    msg = "Could not find image: {}".format(
                        orf_types_bar_chart)
                    logger.warning(msg)

            if (i > 0) and (i % 6) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i % 6 != 0:
            latex.clearpage(out)

        ### ORF type length distributions
        title = "Predicted ORF type length distributions"
        latex.section(out, title)

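        # the length distribution line graphs are collected into figures of
        # at most four images each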
        i = 0
        for sample_name in sample_names:

            try:
                lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                    config,
                    sample_name,
                    is_unique=is_unique,
                    default_params=metagene_options)
            except FileNotFoundError:
                msg = (
                    "Could not parse out lengths and offsets for sample: {}. "
                    "Skipping".format(sample_name))
                logger.error(msg)
                continue

            caption = "ORF type length distributions: {}".format(sample_name)

            is_first = True
            it = itertools.product(grouped_values, chisq_values)

            for is_grouped, is_chisq in it:

                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                orf_length_line_graph = filenames.get_orf_length_distribution_line_graph(
                    config['riboseq_data'],
                    sample_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq)

                if os.path.exists(orf_length_line_graph):

                    if is_first or (i % 4 == 0):
                        latex.begin_figure(out)
                        is_first = False

                    i += 1
                    latex.write_graphics(out,
                                         orf_length_line_graph,
                                         height=0.15)

                    if i % 4 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)

                else:
                    msg = "Could not find image: {}".format(
                        orf_length_line_graph)
                    logger.debug(msg)

            if (i > 0) and (i % 4) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i % 4 != 0:
            latex.clearpage(out)

        # now, if the config file specifies replicates, create figures for those
        i = 0
        for replicate_name in replicate_names:
            lengths = None
            offsets = None

            caption = "ORF types: {}".format(replicate_name)

            is_first = True
            it = itertools.product(grouped_values, chisq_values)

            for is_grouped, is_chisq in it:

                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                orf_length_line_graph = filenames.get_orf_length_distribution_line_graph(
                    config['riboseq_data'],
                    replicate_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq)

                if os.path.exists(orf_length_line_graph):

                    if is_first or (i % 4 == 0):
                        latex.begin_figure(out)
                        is_first = False

                    i += 1
                    latex.write_graphics(out,
                                         orf_length_line_graph,
                                         height=0.15)

                    if i % 4 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)

                else:
                    msg = "Could not find image: {}".format(
                        orf_length_line_graph)
                    logger.debug(msg)

            if (i > 0) and (i % 4) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i % 4 != 0:
            latex.clearpage(out)

        ### ORF type metagene profiles
        if args.show_orf_periodicity:
            title = "Predicted ORF type metagene profiles"
            latex.section(out, title)

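            # the metagene profile images are collected into figures of at
            # most four images each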
            i = 0
            for sample_name in sample_names:

                try:
                    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                        config,
                        sample_name,
                        is_unique=is_unique,
                        default_params=metagene_options)
                except FileNotFoundError:
                    msg = (
                        "Could not parse out lengths and offsets for sample: {}. "
                        "Skipping".format(sample_name))
                    logger.error(msg)
                    continue

                caption = "ORF type metagene profiles: {}".format(sample_name)

                is_first = True

                for is_chisq in chisq_values:

                    if is_chisq:
                        f = None
                        rw = None
                    else:
                        f = fraction
                        rw = reweighting_iterations

                    orf_type_profile_base = filenames.get_orf_type_profile_base(
                        config['riboseq_data'],
                        sample_name,
                        length=lengths,
                        offset=offsets,
                        is_unique=is_unique,
                        note=out_note_str,
                        fraction=f,
                        reweighting_iterations=rw,
                        is_chisq=is_chisq)

                    it = itertools.product(ribo_utils.orf_types, strands)

                    for orf_type, strand in it:
                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base, orf_type, strand,
                            args.image_type)

                        msg = "Looking for image file: {}".format(
                            orf_type_profile)
                        logger.debug(msg)
                        if os.path.exists(orf_type_profile):

                            if is_first or (i % 4 == 0):
                                latex.begin_figure(out)
                                is_first = False

                            i += 1
                            latex.write_graphics(out,
                                                 orf_type_profile,
                                                 height=0.23)

                            if i % 4 == 0:
                                latex.write_caption(out, caption)
                                latex.end_figure(out)
                                latex.clearpage(out)

                        else:
                            msg = "Could not find image: {}".format(
                                orf_type_profile)
                            logger.warning(msg)

                if (i > 0) and (i % 4 != 0):
                    latex.write_caption(out, caption)
                    latex.end_figure(out)
                    #latex.clearpage(out)

            if i % 4 != 0:
                latex.clearpage(out)

            i = 0
            for replicate_name in replicate_names:
                lengths = None
                offsets = None

                caption = "ORF type metagene profiles: {}".format(
                    replicate_name)
                is_first = True
                for is_chisq in chisq_values:

                    if is_chisq:
                        f = None
                        rw = None
                    else:
                        f = fraction
                        rw = reweighting_iterations

                    orf_type_profile_base = filenames.get_orf_type_profile_base(
                        config['riboseq_data'],
                        replicate_name,
                        length=lengths,
                        offset=offsets,
                        is_unique=is_unique,
                        note=out_note_str,
                        fraction=f,
                        reweighting_iterations=rw,
                        is_chisq=is_chisq)

                    it = itertools.product(ribo_utils.orf_types, strands)

                    for orf_type, strand in it:

                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base, orf_type, strand,
                            args.image_type)

                        if os.path.exists(orf_type_profile):

                            if is_first or (i % 4 == 0):
                                latex.begin_figure(out)
                                is_first = False

                            i += 1
                            latex.write_graphics(out,
                                                 orf_type_profile,
                                                 height=0.23)

                            if i % 4 == 0:
                                latex.write_caption(out, caption)
                                latex.end_figure(out)
                                latex.clearpage(out)
                        else:
                            msg = "Could not find image: {}".format(
                                orf_type_profile)
                            logger.debug(msg)

                if (i > 0) and (i % 4 != 0):
                    latex.write_caption(out, caption)
                    latex.end_figure(out)
                    #latex.clearpage(out)

            if i % 4 != 0:
                latex.clearpage(out)

        latex.end_document(out)

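    # compile the generated LaTeX report in the output directory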
    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)