Example #1
def add_overlaps(diff_micropeptides, overlap_file, name, bed_df_a, bed_df_b,
                 exons):
    msg = "Reading overlaps file: {}".format(overlap_file)
    logger.info(msg)
    overlap_bed = bed_utils.read_bed(overlap_file)

    msg = "Finding overlaps"
    a_overlaps = bed_utils.get_bed_overlaps(bed_df_a, overlap_bed, exons=exons)
    a_overlaps_ids = {to.a_info for to in a_overlaps}

    b_overlaps = bed_utils.get_bed_overlaps(bed_df_b, overlap_bed, exons=exons)
    b_overlaps_ids = {to.a_info for to in b_overlaps}

    m_match_a = diff_micropeptides['A'].isin(a_overlaps_ids)
    m_match_b = diff_micropeptides['B'].isin(b_overlaps_ids)

    match_name_a = "{}_A".format(name)
    match_name_b = "{}_B".format(name)

    diff_micropeptides[match_name_a] = 'No'
    diff_micropeptides[match_name_b] = 'No'

    diff_micropeptides.loc[m_match_a, match_name_a] = 'Yes'
    diff_micropeptides.loc[m_match_b, match_name_b] = 'Yes'

    return diff_micropeptides
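The 'Yes'/'No' columns above are filled with a common pandas idiom: build a boolean mask with isin, initialize the column to a default, then overwrite the masked rows with loc. A self-contained toy sketch of that pattern (the identifiers are made up):

import pandas as pd

df = pd.DataFrame({'A': ['orf_1', 'orf_2', 'orf_3']})
overlap_ids = {'orf_1', 'orf_3'}      # hypothetical identifiers that had an overlap

m_match = df['A'].isin(overlap_ids)   # boolean mask, one entry per row
df['lncRNA_A'] = 'No'                 # default value for every row
df.loc[m_match, 'lncRNA_A'] = 'Yes'   # flip only the matching rows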
Example #2
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script clusters the ORFs based on their subcodon "
        "counts using a DP-GMM; the means and weights of the clusters are "
        "written to a pickle file which consists of a list. The first element "
        "of the list are the means, and the second is the weights.")

    parser.add_argument('bf', help="The bayes factor file containing counts")
    parser.add_argument('out', help="The output (pickle) file")

    parser.add_argument('--fraction', help="The top <fraction> genes, based "
        "on normalized read counts, will be used for clustering", type=float,
        default=default_fraction)

    parser.add_argument('--max-iter', help="The maximum number of iterations "
        "for clustering", type=int, default=default_max_iter)
    parser.add_argument('--n-components', help="The maximum number of "
        "clusters", type=int, default=default_n_components)
    parser.add_argument('--seed', help="The seed for the random number "
        "generator", type=int, default=default_seed)
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading BF file"
    logger.info(msg)
    bf = bed_utils.read_bed(args.bf)

    msg = "Extracting top k% of ORFs"
    logger.info(msg)
    
    # calculate the normalized read coverage
    total_read_coverage = bf["x_1_sum"] + bf["x_2_sum"] + bf["x_3_sum"]
    rpk = total_read_coverage / bf['orf_len']
    sorted_rpk_indices = np.argsort(rpk)

    # and get the best args.fraction of them
    num_orfs = int(len(rpk) * args.fraction)
    top_k_orfs = sorted_rpk_indices.tail(num_orfs)

    msg = "Finding subcodon clusters"
    logger.info(msg)

    x_i_fields = ["x_1_sum", "x_2_sum", "x_3_sum"]
    X = bf.iloc[top_k_orfs.values][x_i_fields]

    model = np_utils.fit_bayesian_gaussian_mixture(X, 
        max_iter=args.max_iter, n_components=args.n_components, seed=args.seed)

    msg = "Writing means and weights to disk"
    logger.info(msg)

    to_pkl = [model.means_, model.weights_]
    with open(args.out, 'wb') as out_file:
        pickle.dump(to_pkl, out_file)
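Since only [model.means_, model.weights_] are pickled, the fitted cluster parameters can be recovered with a plain pickle load. A small sketch, assuming a hypothetical output file name:

import pickle

with open("subcodon-clusters.pkl", "rb") as in_file:   # hypothetical file written by this script
    means, weights = pickle.load(in_file)

# means has shape (n_components, 3): one row of subcodon sums per cluster
# weights has shape (n_components,): the mixture weight of each cluster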
Example #3
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script visualizes the metagene profiles for each ORF type "
        "present in a given BED12+ file. It visualizes the mean and variance of normalized "
        "profiles in the first 21-bp, last 21-bp, and across all other 21-bp windows.")

    parser.add_argument('orfs', help="The BED12+ file containing the ORFs")
    parser.add_argument('profiles', help="The (mtx) file containing the ORF profiles")
    parser.add_argument('out', help="The base output name. The output filenames will be of "
        "the form: <out>.<orf-type>.<image-type>.")

    parser.add_argument('--min-profile', help="The minimum value of the sum over the profile "
        "to include it in the analysis", type=float, default=default_min_profile)

    parser.add_argument('--max-orfs', help="At most this many ORFs of each type will be "
        "used to create the figures. They will be sampled randomly from among those "
        "which meet the min-profile constraint.", type=int, default=default_max_orfs)

    parser.add_argument('--title', help="The prefix to use for the title of the plots",
        default=default_title)

    parser.add_argument('--image-type', help="The type of image files to create. The type "
        "must be recognized by matplotlib.", default=default_image_type)
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading ORFs"
    logger.info(msg)
    orfs = bed_utils.read_bed(args.orfs)

    msg = "Reading profiles"
    logger.info(msg)
    profiles = scipy.io.mmread(args.profiles).tocsr()

    msg = "Extracting the metagene profiles and creating the images"
    logger.info(msg)

    orf_type_groups = orfs.groupby('orf_type')
    orf_type_groups.apply(extract_profiles_and_plot, profiles, args)

    msg = "Finished"
    logger.info(msg)
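The groupby/apply call above forwards profiles and args as extra positional arguments to extract_profiles_and_plot; pandas passes everything after the function on to it. A self-contained toy version of the same pattern:

import pandas as pd

def summarize(group, scale, label):
    # group is the sub-frame for one orf_type; scale and label are the
    # extra positional arguments forwarded by apply
    print(label, group['count'].sum() * scale)

df = pd.DataFrame({'orf_type': ['canonical', 'canonical', 'five_prime'],
                   'count': [1, 2, 3]})
df.groupby('orf_type').apply(summarize, 10, 'scaled total:')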
Example #4
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script extracts the differential micropeptides from two "
        "conditions. Please see the documentation in redmine for more details.\n\n"
        "Please see the pyensembl (https://github.com/hammerlab/pyensembl) "
        "documentation for more information about the ensembl release and species."
    )

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name_a', help="The name of the first condition")
    parser.add_argument('name_b', help="The name of the second condition")
    parser.add_argument('out', help="The output (.csv.gz or .xlsx) file")

    parser.add_argument(
        '-a',
        '--append-sheet',
        help="If this flag is given, "
        "then a worksheet with the name '<name_a>,<name_b>' will be appended "
        "to the .xlsx file given by out (if it exists)",
        action='store_true')

    parser.add_argument(
        '-f',
        '--filter',
        help="If this flag is present, then "
        "the output will be filtered to include only the differential "
        "micropeptides with the highest KL-divergence and read coverage",
        action='store_true')

    parser.add_argument(
        '--read-filter-percent',
        help="If the the --filter flag "
        "is given, then only the top --read-filter-percent micropeptides will "
        "be considered for the final output. They still must meet the KL-"
        "divergence filtering criteria.",
        type=float,
        default=default_read_filter_percent)

    parser.add_argument(
        '--kl-filter-percent',
        help="If the the --filter flag "
        "is given, then only the top --read-kl-percent micropeptides will "
        "be considered for the final output. They still must meet the read "
        "coverage filtering criteria.",
        type=float,
        default=default_kl_filter_percent)

    parser.add_argument(
        '--id-matches',
        help="This is a list of files which "
        "contain ORF identifiers to compare to the differential micropeptides. "
        "For each of the files given, two columns will be added to the output "
        "which indicate if either A or B appear in the respective file. Each "
        "file should have a single ORF identifier on each line and contain "
        "nothing else.",
        nargs='*',
        default=default_id_matches)

    parser.add_argument(
        '--id-match-names',
        help="A name to include in the "
        "output file for each --id-matches file. The number of names must "
        "match the number of files.",
        nargs='*',
        default=default_id_match_names)

    parser.add_argument(
        '--overlaps',
        help="This is a list of bed12+ files "
        "which will be compared to the differential micropeptides. Two columns "
        "(one for A, one for B) will be added to the output which indicate if "
        "the respective micropeptides overlap a feature in each file by at "
        "least 1 bp.",
        nargs='*',
        default=default_overlaps)

    parser.add_argument(
        '--overlap-names',
        help="A name to include in the "
        "output file for each --overlaps file. The number of names must match "
        "the number of files.",
        nargs='*',
        default=default_overlap_names)

    parser.add_argument(
        '-r',
        '--ensembl-release',
        help="The version of Ensembl "
        "to use when mapping transcript identifiers to gene identifiers",
        type=int,
        default=default_ensembl_release)

    parser.add_argument(
        '-s',
        '--ensembl-species',
        help="The Ensembl species "
        "to use when mapping transcript identifiers to gene identifiers",
        default=default_ensembl_species)

    parser.add_argument(
        '--a-is-single-sample',
        help="By default, this script "
        "assumes the predictions come from merged replicates. If name_a is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.",
        action='store_true')

    parser.add_argument(
        '--b-is-single-sample',
        help="By default, this script "
        "assumes the predictions come from merged replicates. If name_b is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.",
        action='store_true')

    parser.add_argument('--fields-to-keep',
                        help="The fields to keep from the "
                        "Bayes factor file for each condition",
                        nargs='*',
                        default=default_fields_to_keep)

    parser.add_argument('--max-micropeptide-len',
                        help="The maximum (inclusive) "
                        "length of ORFs considered as micropeptides",
                        type=int,
                        default=default_max_micropeptide_len)

    parser.add_argument(
        '--do-not-fix-tcons',
        help="By default, the \"TCONS_\" "
        "identifiers from StringTie, etc., do not parse correctly; this script "
        "update the identifiers so that will parse correctly unless instructed not "
        "to. The script is likely to crash if the identifiers are not fixed.",
        action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ensembl database"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release,
                                       species=args.ensembl_species)

    # touch the database so pyensembl downloads and indexes the annotations
    # before any real work is done
    ensembl.db

    msg = "Checking the id-match and overlaps files"
    logger.info(msg)

    if len(args.id_matches) != len(args.id_match_names):
        msg = ("The number of --id-matches files and --id-match-names do not "
               "match. {} files and {} names".format(len(args.id_matches),
                                                     len(args.id_match_names)))

        raise ValueError(msg)

    if len(args.overlaps) != len(args.overlap_names):
        msg = ("The number of --overlaps files and --overlaps-names do not "
               "match. {} files and {} names".format(len(args.overlaps),
                                                     len(args.overlap_names)))

        raise ValueError(msg)

    utils.check_files_exist(args.id_matches)
    utils.check_files_exist(args.overlaps)

    if args.filter:
        msg = "Validating filter percentages"
        logger.info(msg)

        math_utils.check_range(args.read_filter_percent,
                               0,
                               1,
                               variable_name="--read-filter-percent")

        math_utils.check_range(args.kl_filter_percent,
                               0,
                               1,
                               variable_name="--kl-filter-percent")

    msg = "Extracting file names"
    logger.info(msg)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    note_str = config.get('note', None)

    # keep multimappers?
    is_unique = 'keep_riboseq_multimappers' not in config

    # and the smoothing parameters
    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations',
                                        None)

    lengths_a = None
    offsets_a = None

    if args.a_is_single_sample:
        lengths_a, offsets_a = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name_a, is_unique=is_unique)

    bayes_factors_a = filenames.get_riboseq_bayes_factors(
        config['riboseq_data'],
        args.name_a,
        length=lengths_a,
        offset=offsets_a,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors_a):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".
               format(args.name_a, bayes_factors_a))
        raise FileNotFoundError(msg)

    predicted_orfs_a = filenames.get_riboseq_predicted_orfs(
        config['riboseq_data'],
        args.name_a,
        length=lengths_a,
        offset=offsets_a,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations,
        is_filtered=True,
        is_chisq=False)

    if not os.path.exists(predicted_orfs_a):
        msg = (
            "Could not find the predictions bed file for {}. ({}). Quitting.".
            format(args.name_a, predicted_orfs_a))
        raise FileNotFoundError(msg)

    lengths_b = None
    offsets_b = None
    if args.b_is_single_sample:
        lengths_b, offsets_b = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name_b, is_unique=is_unique)

    bayes_factors_b = filenames.get_riboseq_bayes_factors(
        config['riboseq_data'],
        args.name_b,
        length=lengths_b,
        offset=offsets_b,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors_b):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".
               format(args.name_b, bayes_factors_b))
        raise FileNotFoundError(msg)

    predicted_orfs_b = filenames.get_riboseq_predicted_orfs(
        config['riboseq_data'],
        args.name_b,
        length=lengths_b,
        offset=offsets_b,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations,
        is_filtered=True,
        is_chisq=False)

    if not os.path.exists(predicted_orfs_b):
        msg = (
            "Could not find the predictions bed file for {}. ({}). Quitting.".
            format(args.name_b, predicted_orfs_b))
        raise FileNotFoundError(msg)

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    if not os.path.exists(exons_file):
        msg = "Could not find the exons file ({}). Quitting.".format(
            exons_file)
        raise FileNotFoundError(msg)

    msg = "Reading the exons"
    logger.info(msg)

    exons = bed_utils.read_bed(exons_file)

    msg = "Reading the BF files"
    logger.info(msg)

    bf_df_a = bed_utils.read_bed(bayes_factors_a)
    bf_df_b = bed_utils.read_bed(bayes_factors_b)

    msg = "Reading the predictions files"
    logger.info(msg)

    bed_df_a = bed_utils.read_bed(predicted_orfs_a)
    bed_df_b = bed_utils.read_bed(predicted_orfs_b)

    differential_micropeptide_dfs = []

    # extract micropeptides
    msg = "Extracting micropeptides"
    logger.info(msg)

    m_micropeptides_a = bed_df_a['orf_len'] <= args.max_micropeptide_len
    m_micropeptides_b = bed_df_b['orf_len'] <= args.max_micropeptide_len

    micropeptides_a = bed_df_a[m_micropeptides_a]
    micropeptides_b = bed_df_b[m_micropeptides_b]

    long_orfs_a = bed_df_a[~m_micropeptides_a]
    long_orfs_b = bed_df_b[~m_micropeptides_b]

    msg = "Finding micropeptides in A with no overlap in B"
    logger.info(msg)

    micropeptides_a_no_match_b = bed_utils.subtract_bed(micropeptides_a,
                                                        bed_df_b,
                                                        exons=exons)

    micropeptides_a_no_match_b_df = pd.DataFrame()
    micropeptides_a_no_match_b_df['A'] = list(micropeptides_a_no_match_b)
    micropeptides_a_no_match_b_df['B'] = None
    micropeptides_a_no_match_b_df['kl'] = np.inf
    micropeptides_a_no_match_b_df['overlap_type'] = 'micro_a_only'

    differential_micropeptide_dfs.append(micropeptides_a_no_match_b_df)

    msg = "Finding micropeptides in B with no overlap in A"
    logger.info(msg)

    micropeptides_b_no_match_a = bed_utils.subtract_bed(micropeptides_b,
                                                        bed_df_a,
                                                        exons=exons)

    micropeptides_b_no_match_a_df = pd.DataFrame()
    micropeptides_b_no_match_a_df['B'] = list(micropeptides_b_no_match_a)
    micropeptides_b_no_match_a_df['A'] = None
    micropeptides_b_no_match_a_df['kl'] = np.inf
    micropeptides_b_no_match_a_df['overlap_type'] = 'micro_b_only'

    differential_micropeptide_dfs.append(micropeptides_b_no_match_a_df)

    msg = "Finding overlapping micropeptides"
    logger.info(msg)

    micropeptides_a_micropeptides_b_df = get_overlap_df(
        micropeptides_a, micropeptides_b, 'micro_a_micro_b', bf_df_a, bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_micropeptides_b_df)

    micropeptides_a_long_b_df = get_overlap_df(micropeptides_a, long_orfs_b,
                                               'micro_a_long_b', bf_df_a,
                                               bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_long_b_df)

    micropeptides_b_long_a_df = get_overlap_df(long_orfs_a, micropeptides_b,
                                               'long_a_micro_b', bf_df_a,
                                               bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_b_long_a_df)

    differential_micropeptides_df = pd.concat(differential_micropeptide_dfs)

    msg = "Adding read count information"
    logger.info(msg)

    res = differential_micropeptides_df.merge(bf_df_a[args.fields_to_keep],
                                              left_on='A',
                                              right_on='id',
                                              how='left')
    to_rename = {f: "{}_A".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_A', axis=1)

    res = res.merge(bf_df_b[args.fields_to_keep],
                    left_on='B',
                    right_on='id',
                    how='left')
    to_rename = {f: "{}_B".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_B', axis=1)

    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    if not args.do_not_fix_tcons:
        # replace TCONS_ with TCONS
        res['A'] = res['A'].str.replace("TCONS_", "TCONS")
        res['B'] = res['B'].str.replace("TCONS_", "TCONS")

    msg = "Extracting the genes and their biotypes using pyensembl"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release,
                                       species=args.ensembl_species)
    ensembl_transcript_ids = set(ensembl.transcript_ids())

    biotypes_a = parallel.apply_df_simple(res, get_transcript_and_biotype, 'A',
                                          ensembl, ensembl_transcript_ids)
    biotypes_b = parallel.apply_df_simple(res, get_transcript_and_biotype, 'B',
                                          ensembl, ensembl_transcript_ids)

    biotypes_a = utils.remove_nones(biotypes_a)
    biotypes_b = utils.remove_nones(biotypes_b)

    biotypes_a = pd.DataFrame(biotypes_a)
    biotypes_b = pd.DataFrame(biotypes_b)

    res = res.merge(biotypes_a, on='A', how='left')
    res = res.merge(biotypes_b, on='B', how='left')

    msg = "Pulling annotations from mygene.info"
    logger.info(msg)

    # pull annotations from mygene
    gene_info_a = mygene_utils.query_mygene(res['gene_id_A'])
    gene_info_b = mygene_utils.query_mygene(res['gene_id_B'])

    # and add the mygene info
    res = res.merge(gene_info_a,
                    left_on='gene_id_A',
                    right_on='gene_id',
                    how='left')

    to_rename = {f: "{}_A".format(f) for f in gene_info_a.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)

    res = res.merge(gene_info_b,
                    left_on='gene_id_B',
                    right_on='gene_id',
                    how='left')

    to_rename = {f: "{}_B".format(f) for f in gene_info_a.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)

    msg = "Removing duplicates"
    logger.info(msg)
    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    msg = "Adding --id-matches columns"
    logger.info(msg)

    for (id_match_file, name) in zip(args.id_matches, args.id_match_names):
        res = add_id_matches(res, id_match_file, name)

    msg = "Adding --overlaps columns"
    logger.info(msg)

    for (overlap_file, name) in zip(args.overlaps, args.overlap_names):
        res = add_overlaps(res, overlap_file, name, bed_df_a, bed_df_b, exons)

    msg = "Sorting by in-frame reads"
    logger.info(msg)

    res['x_1_sum_A'] = res['x_1_sum_A'].fillna(0)
    res['x_1_sum_B'] = res['x_1_sum_B'].fillna(0)
    res['x_1_sum'] = res['x_1_sum_A'] + res['x_1_sum_B']
    res = res.sort_values('x_1_sum', ascending=False)

    if args.filter:
        msg = "Filtering the micropeptides by read coverage and KL-divergence"
        logger.info(msg)

        x_1_sum_ranks = res['x_1_sum'].rank(method='min',
                                            na_option='top',
                                            ascending=False)
        num_x_1_sum_ranks = x_1_sum_ranks.max()
        max_good_x_1_sum_rank = num_x_1_sum_ranks * args.read_filter_percent
        m_good_x_1_sum_rank = x_1_sum_ranks <= max_good_x_1_sum_rank

        msg = ("Number of micropeptides passing read filter: {}".format(
            sum(m_good_x_1_sum_rank)))
        logger.debug(msg)

        kl_ranks = res['kl'].rank(method='dense',
                                  na_option='top',
                                  ascending=False)
        num_kl_ranks = kl_ranks.max()
        max_good_kl_rank = num_kl_ranks * args.kl_filter_percent
        m_good_kl_rank = kl_ranks <= max_good_kl_rank

        msg = ("Number of micropeptides passing KL filter: {}".format(
            sum(m_good_kl_rank)))
        logger.debug(msg)

        m_both_filters = m_good_x_1_sum_rank & m_good_kl_rank

        msg = ("Number of micropeptides passing both filters: {}".format(
            sum(m_both_filters)))
        logger.debug(msg)

        res = res[m_both_filters]

    msg = "Writing differential micropeptides to disk"
    logger.info(msg)

    if not args.append_sheet:
        utils.write_df(res, args.out, index=False)
    else:
        sheet_name = "{},{}".format(args.name_a, args.name_b)
        utils.append_to_xlsx(res, args.out, sheet=sheet_name, index=False)
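The read-coverage filter above keeps roughly the top --read-filter-percent fraction of rows by ranking x_1_sum in descending order and comparing each rank against a cutoff; the KL filter works the same way. A self-contained sketch of the idea on toy numbers (the 0.5 cutoff is arbitrary):

import pandas as pd

x_1_sum = pd.Series([100, 5, 50, 0, 20])
read_filter_percent = 0.5                           # arbitrary value for the sketch

ranks = x_1_sum.rank(method='min', na_option='top', ascending=False)
max_good_rank = ranks.max() * read_filter_percent
m_good = ranks <= max_good_rank                     # True for roughly the top half

print(x_1_sum[m_good])                              # keeps 100 and 50 (ranks 1 and 2)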
Example #5
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script labels the ORFs found with extract-orf-coordinates "
        "based on their exon structure and relation to the annotated, canonical "
        "ORFs. It requires the exon blocks for the ORFs (created with "
        "split-bed12-blocks). It completely reads in the ORFs, so unless otherwise "
        "desired for some reason, the input and output files can be the same.")

    parser.add_argument('annotated_transcripts',
                        help="The annotated transcripts "
                        "for the genome, in bed12+ format")
    parser.add_argument('extracted_orfs',
                        help="The ORFs extracted from the "
                        "transcripts, in bed12+ format")
    parser.add_argument('orf_exons',
                        help="The exon blocks for the ORFs, in "
                        "bed6+ format")

    parser.add_argument('out', help="The output (bed12+.gz) file")

    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of CPUs to use for "
                        "a few parts of the script",
                        type=int,
                        default=default_num_cpus)

    parser.add_argument(
        '-f',
        '--filter',
        help="If this flag is given, then ORFs "
        "which are completely covered by an annotated transcript are discarded. "
        "Presumably, this is used to filter uninteresting ORFs from de novo "
        "assemblies.",
        action='store_true')

    parser.add_argument(
        '-e',
        '--annotated-exons',
        help="If the --filter flag is "
        "given, the annotated transcript exons can optionally be provided with "
        "this option. If they are not given, they will be split from the annotated "
        "transcripts. That is generally not a very expensive operation relative to "
        "everything else in the labeling script. If --filter is not given, then "
        "these are ignored.",
        default=default_annotated_exons)

    parser.add_argument(
        '-n',
        '--nonoverlapping-label',
        help="If this option is "
        "given, then ORFs which do not overlap the annotated transcripts at all "
        "will be given this label. Otherwise, they will be labeled as \"suspect\"",
        default=default_nonoverlapping_label)

    parser.add_argument(
        '-l',
        '--label-prefix',
        help="This string is prepended "
        "to all labels assigned to ORFs. For example, it is a useful way to "
        "indicate ORFs from de novo assemblies are \"novel.\" In any case, this "
        "*is not* prepended to \"canonical\" ORFs.",
        default=default_label_prefix)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading annotated transcripts"
    logger.info(msg)
    annotated_transcripts = bed_utils.read_bed(args.annotated_transcripts)

    msg = "Reading extracted ORFs and exons"
    logger.info(msg)
    extracted_orfs = bed_utils.read_bed(args.extracted_orfs)
    extracted_orf_exons = bed_utils.read_bed(args.orf_exons)

    msg = "Found {} extracted ORFs with {} exons".format(
        len(extracted_orfs), len(extracted_orf_exons))
    logger.debug(msg)

    # check if we want to remove the extracted_orfs completely covered by
    # the annotated transcripts
    if args.filter:
        msg = ("Removing extracted ORFs which are completely covered by the "
               "annotated transcripts")
        logger.info(msg)

        # we need the annotated transcript exons
        if args.annotated_exons is None:
            msg = "Splitting the annotated transcripts into exon blocks"
            logger.info(msg)

            annotated_exons = bed_utils.split_bed12(annotated_transcripts,
                                                    num_cpus=args.num_cpus,
                                                    progress_bar=True)
        else:
            msg = "Reading the annotated transcript exons"
            logger.info(msg)

            annotated_exons = bed_utils.read_bed(args.annotated_exons)

        msg = "Finding completely covered extracted ORFs"
        logger.info(msg)

        nonoverlapping_ids = bed_utils.subtract_bed(extracted_orf_exons,
                                                    annotated_exons,
                                                    min_a_overlap=1)

        m_unfiltered = extracted_orfs['id'].isin(nonoverlapping_ids)
        extracted_orfs = extracted_orfs[m_unfiltered]

        # also discard the unnecessary exons
        m_unfiltered = extracted_orf_exons['id'].isin(nonoverlapping_ids)
        extracted_orf_exons = extracted_orf_exons[m_unfiltered]

        msg = "After filtering, {} extracted ORFs remain".format(
            len(extracted_orfs))
        logger.info(msg)

    # if the nonoverlapping-label is given, annotate and remove the ORFs
    # which do not at all overlap the annotations
    if args.nonoverlapping_label is not None:

        # the annotated exon blocks are only created above when --filter is
        # given, so split the annotated transcripts here if necessary
        if not args.filter:
            annotated_exons = bed_utils.split_bed12(annotated_transcripts,
                                                    num_cpus=args.num_cpus,
                                                    progress_bar=True)

        nonoverlapping_ids = bed_utils.subtract_bed(
            extracted_orfs,
            annotated_transcripts,
            exons_a=extracted_orf_exons,
            exons_b=annotated_exons)

        m_nonoverlapping = extracted_orf_exons['id'].isin(nonoverlapping_ids)
        extracted_orf_exons = extracted_orf_exons[~m_nonoverlapping]

        m_nonoverlapping = extracted_orfs['id'].isin(nonoverlapping_ids)
        extracted_orfs.loc[m_nonoverlapping,
                           'orf_type'] = args.nonoverlapping_label

        msg = ("Found {} ORFs completely nonoverlapping annotated transcripts".
               format(len(nonoverlapping_ids)))
        logger.info(msg)

    msg = "Removing the annotated UTRs from the transcripts"
    logger.info(msg)
    canonical_orfs = bed_utils.retain_all_thick_only(annotated_transcripts,
                                                     num_cpus=args.num_cpus)

    msg = "Splitting the canonical ORFs into exons"
    logger.info(msg)
    canonical_orf_exons = bed_utils.split_bed12(canonical_orfs,
                                                num_cpus=args.num_cpus,
                                                progress_bar=True)

    msg = "Extracting annotated 5' leader regions"
    logger.info(msg)
    five_prime_regions = bed_utils.retain_all_five_prime_of_thick(
        annotated_transcripts, num_cpus=args.num_cpus)

    if len(five_prime_regions) == 0:
        msg = "No annotated 5' leader regions were found"
        logger.warning(msg)

    msg = "Splitting the 5' leaders into exons"
    logger.info(msg)
    five_prime_exons = bed_utils.split_bed12(five_prime_regions,
                                             num_cpus=args.num_cpus,
                                             progress_bar=True)

    msg = "Extracting annotated 3' trailer regions"
    logger.info(msg)
    three_prime_regions = bed_utils.retain_all_three_prime_of_thick(
        annotated_transcripts, num_cpus=args.num_cpus)

    if len(three_prime_regions) == 0:
        msg = "No annotated 3' trailer regions were found"
        logger.warning(msg)

    msg = "Splitting the 3' trailers into exons"
    logger.info(msg)
    three_prime_exons = bed_utils.split_bed12(three_prime_regions,
                                              num_cpus=args.num_cpus,
                                              progress_bar=True)

    msg = "Splitting noncoding transcripts into exons"
    logger.info(msg)

    m_no_thick_start = annotated_transcripts['thick_start'] == -1
    m_no_thick_end = annotated_transcripts['thick_end'] == -1
    m_no_thick = m_no_thick_start & m_no_thick_end
    noncoding_transcripts = annotated_transcripts[m_no_thick]

    noncoding_exons = bed_utils.split_bed12(noncoding_transcripts,
                                            num_cpus=args.num_cpus,
                                            progress_bar=True)

    msg = "Marking canonical and extracted ORFs with the same stop codon"
    logger.info(msg)

    # first, add the true ORF end
    m_forward_canonical = canonical_orfs['strand'] == '+'
    m_reverse_canonical = canonical_orfs['strand'] == '-'

    m_forward_extracted = extracted_orfs['strand'] == '+'
    m_reverse_extracted = extracted_orfs['strand'] == '-'

    canonical_orfs['orf_end'] = canonical_orfs['end']
    canonical_orfs.loc[m_reverse_canonical,
                       'orf_end'] = canonical_orfs.loc[m_reverse_canonical,
                                                       'start']

    extracted_orfs['orf_end'] = extracted_orfs['end']
    extracted_orfs.loc[m_reverse_extracted,
                       'orf_end'] = extracted_orfs.loc[m_reverse_extracted,
                                                       'start']

    # now, find extracted ORFs with the same "orf_end" (and seqname, strand) as canonical ORFs
    merge_fields = ['seqname', 'strand', 'orf_end']
    canonical_extracted_orf_ends = canonical_orfs.merge(
        extracted_orfs, on=merge_fields, suffixes=['_canonical', '_extracted'])

    # now, pull this into a set
    zip_it = zip(canonical_extracted_orf_ends['id_canonical'],
                 canonical_extracted_orf_ends['id_extracted'])
    canonical_extracted_matching_ends = {(c, a) for c, a in zip_it}

    msg = "Finding ORFs which exactly overlap the canonical ORFs"
    logger.info(msg)

    exact_matches = bed_utils.get_bed_overlaps(canonical_orf_exons,
                                               extracted_orf_exons,
                                               min_a_overlap=1,
                                               min_b_overlap=1)

    exact_match_orf_ids = {o.b_info for o in exact_matches}
    m_exact_orf_matches = extracted_orf_exons['id'].isin(exact_match_orf_ids)
    extracted_orf_exons = extracted_orf_exons[~m_exact_orf_matches]

    m_canonical = extracted_orfs['id'].isin(exact_match_orf_ids)
    extracted_orfs.loc[m_canonical, 'orf_type'] = 'canonical'

    msg = "Found {} canonical ORFs".format(len(exact_match_orf_ids))
    logger.info(msg)

    msg = "Finding ORFs which are extended versions of the canonical ORFs"
    logger.info(msg)

    extended_matches = bed_utils.get_bed_overlaps(canonical_orf_exons,
                                                  extracted_orf_exons,
                                                  min_a_overlap=1)

    # make sure the "end"s match before calling something an extended match
    extended_match_ids = {
        m.b_info
        for m in tqdm.tqdm(extended_matches)
        if (m.a_info, m.b_info) in canonical_extracted_matching_ends
    }

    m_extended_matches = extracted_orf_exons['id'].isin(extended_match_ids)
    extracted_orf_exons = extracted_orf_exons[~m_extended_matches]

    m_canonical_extended = extracted_orfs['id'].isin(extended_match_ids)

    l = "{}canonical_extended".format(args.label_prefix)
    extracted_orfs.loc[m_canonical_extended, 'orf_type'] = l

    msg = "Found {} canonical_extended ORFs".format(len(extended_match_ids))
    logger.info(msg)

    msg = "Finding ORFs which are truncated versions of the canonical ORFs"
    logger.info(msg)

    truncated_matches = bed_utils.get_bed_overlaps(canonical_orf_exons,
                                                   extracted_orf_exons,
                                                   min_b_overlap=1)

    # make sure the "end"s match before calling something a truncated match
    truncated_match_ids = {
        m.b_info
        for m in tqdm.tqdm(truncated_matches)
        if (m.a_info, m.b_info) in canonical_extracted_matching_ends
    }

    m_truncated_matches = extracted_orf_exons['id'].isin(truncated_match_ids)
    extracted_orf_exons = extracted_orf_exons[~m_truncated_matches]

    m_canonical_truncated = extracted_orfs['id'].isin(truncated_match_ids)

    l = "{}canonical_truncated".format(args.label_prefix)
    extracted_orfs.loc[m_canonical_truncated, 'orf_type'] = l

    msg = "Found {} canonical_truncated ORFs".format(len(truncated_match_ids))
    logger.info(msg)

    msg = ("Labeling ORFs which are completely covered by a canonical ORF but "
           "do not share its stop codon")
    logger.info(msg)

    # anything in "truncated matches" which *does not* share a stop codon with
    # the match is a "within" orf
    within_ids = {
        m.b_info
        for m in truncated_matches if m.b_info not in truncated_match_ids
    }

    m_within_matches = extracted_orf_exons['id'].isin(within_ids)
    extracted_orf_exons = extracted_orf_exons[~m_within_matches]

    m_within = extracted_orfs['id'].isin(within_ids)

    l = "{}within".format(args.label_prefix)
    extracted_orfs.loc[m_within, 'orf_type'] = l

    msg = "Found {} within ORFs".format(len(within_ids))
    logger.info(msg)

    msg = "Finding out-of-frame overlaps"
    logger.info(msg)
    out_of_frame_matches = bed_utils.get_bed_overlaps(canonical_orf_exons,
                                                      extracted_orf_exons)

    msg = "Finding leader overlaps"
    logger.info(msg)

    leader_matches = bed_utils.get_bed_overlaps(five_prime_exons,
                                                extracted_orf_exons)

    msg = "Finding trailer overlaps"
    logger.info(msg)

    trailer_matches = bed_utils.get_bed_overlaps(three_prime_exons,
                                                 extracted_orf_exons)

    msg = ("Labeling ORFs which have (out-of-frame) overlaps with both a "
           "canonical ORF and annotated leaders or trailers")
    logger.info(msg)

    out_of_frame_ids = {m.b_info for m in out_of_frame_matches}
    leader_ids = {m.b_info for m in leader_matches}
    trailer_ids = {m.b_info for m in trailer_matches}

    leader_overlap_ids = out_of_frame_ids & leader_ids
    trailer_overlap_ids = out_of_frame_ids & trailer_ids

    m_leader_overlap_matches = extracted_orf_exons['id'].isin(
        leader_overlap_ids)
    extracted_orf_exons = extracted_orf_exons[~m_leader_overlap_matches]

    m_trailer_overlap_matches = extracted_orf_exons['id'].isin(
        trailer_overlap_ids)
    extracted_orf_exons = extracted_orf_exons[~m_trailer_overlap_matches]

    m_five_prime_overlap = extracted_orfs['id'].isin(leader_overlap_ids)

    l = "{}five_prime_overlap".format(args.label_prefix)
    extracted_orfs.loc[m_five_prime_overlap, 'orf_type'] = l

    m_three_prime_overlap = extracted_orfs['id'].isin(trailer_overlap_ids)

    l = "{}three_prime_overlap".format(args.label_prefix)
    extracted_orfs.loc[m_three_prime_overlap, 'orf_type'] = l

    msg = "Found {} five_prime_overlap ORFs".format(len(leader_overlap_ids))
    logger.info(msg)

    msg = "Found {} three_prime_overlap ORFs".format(len(trailer_overlap_ids))
    logger.info(msg)

    msg = "Finding ORFs completely within 5' leaders"
    logger.info(msg)

    leader_matches = bed_utils.get_bed_overlaps(five_prime_exons,
                                                extracted_orf_exons,
                                                min_b_overlap=1)
    leader_ids = {m.b_info for m in leader_matches}

    m_leader_matches = extracted_orf_exons['id'].isin(leader_ids)
    extracted_orf_exons = extracted_orf_exons[~m_leader_matches]

    m_five_prime = extracted_orfs['id'].isin(leader_ids)

    l = "{}five_prime".format(args.label_prefix)
    extracted_orfs.loc[m_five_prime, 'orf_type'] = l

    msg = "Found {} five_prime ORFs".format(len(leader_ids))
    logger.info(msg)

    msg = "Finding ORFs completely within 3' trailers"
    logger.info(msg)

    trailer_matches = bed_utils.get_bed_overlaps(three_prime_exons,
                                                 extracted_orf_exons,
                                                 min_b_overlap=1)
    trailer_ids = {m.b_info for m in trailer_matches}

    m_trailer_matches = extracted_orf_exons['id'].isin(trailer_ids)
    extracted_orf_exons = extracted_orf_exons[~m_trailer_matches]

    m_three_prime = extracted_orfs['id'].isin(trailer_ids)

    l = "{}three_prime".format(args.label_prefix)
    extracted_orfs.loc[m_three_prime, 'orf_type'] = l

    msg = "Found {} three_prime ORFs".format(len(trailer_ids))
    logger.info(msg)

    msg = "Finding ORFs completely within annotated, noncoding transcripts"
    logger.info(msg)

    noncoding_matches = bed_utils.get_bed_overlaps(noncoding_exons,
                                                   extracted_orf_exons,
                                                   min_b_overlap=1)

    noncoding_ids = {m.b_info for m in noncoding_matches}

    m_noncoding_matches = extracted_orf_exons['id'].isin(noncoding_ids)
    extracted_orf_exons = extracted_orf_exons[~m_noncoding_matches]

    m_noncoding = extracted_orfs['id'].isin(noncoding_ids)

    l = "{}noncoding".format(args.label_prefix)
    extracted_orfs.loc[m_noncoding, 'orf_type'] = l

    msg = "Found {} noncoding ORFs".format(len(noncoding_ids))
    logger.info(msg)

    # all of the remaining ORFs fall into the "suspect" category
    suspect_ids = set(extracted_orf_exons['id'])

    m_suspect = extracted_orfs['id'].isin(suspect_ids)

    l = "{}suspect".format(args.label_prefix)
    extracted_orfs.loc[m_suspect, 'orf_type'] = l

    msg = "Found {} \"suspect\" ORFs".format(len(suspect_ids))
    logger.info(msg)

    m_no_orf_type = extracted_orfs['orf_type'].isnull()

    msg = "Found {} unlabeled ORFs".format(sum(m_no_orf_type))
    logger.info(msg)

    msg = "Writing ORFs with types to disk"
    logger.info(msg)

    fields = bed_utils.bed12_field_names + ['orf_num', 'orf_len', 'orf_type']
    extracted_orfs = extracted_orfs[fields]
    extracted_orfs = bed_utils.sort(extracted_orfs)

    bed_utils.write_bed(extracted_orfs, args.out)
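Because BED coordinates always run left to right, the stop codon of a reverse-strand ORF sits at the BED start; the orf_end column above encodes exactly that before matching stop codons between canonical and extracted ORFs. A toy illustration of the same assignment:

import pandas as pd

orfs = pd.DataFrame({'id': ['orf_fwd', 'orf_rev'],
                     'strand': ['+', '-'],
                     'start': [100, 200],
                     'end': [160, 260]})

m_reverse = orfs['strand'] == '-'
orfs['orf_end'] = orfs['end']                                  # forward strand: 3' end is the BED end
orfs.loc[m_reverse, 'orf_end'] = orfs.loc[m_reverse, 'start']  # reverse strand: 3' end is the BED start

print(orfs[['id', 'orf_end']])   # orf_fwd -> 160, orf_rev -> 200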
Example #6
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script creates a pie chart which shows the proportion of "
        "each ORF type in a given BED12+ file. Optionally, the ORFs can be grouped "
        "into similar types.")

    parser.add_argument('orfs', help="The BED12+ file with the ORFs")
    parser.add_argument('out', help="The output (image) file")

    parser.add_argument('--title',
                        help="The title to use for the plot",
                        default=default_title)

    parser.add_argument('--use-groups',
                        help="If this flag is given, the the ORFs "
                        "will be grouped",
                        action='store_true')

    args = parser.parse_args()

    orfs = bed_utils.read_bed(args.orfs)

    strands = ['+', '-']
    fracs = []
    labels = []
    for strand in strands:
        m_strand = orfs['strand'] == strand
        orf_type_groups = orfs[m_strand].groupby('orf_type')
        counts = orf_type_groups.size()

        if args.use_groups:
            lab = ribo_utils.orf_type_labels
            fr = [get_orf_label_counts(counts, l) for l in lab]
        else:
            fr = counts.values
            lab = np.array(counts.index)

        lab = ["{} ({})".format(l, f) for l, f in zip(lab, fr)]

        fracs.append(fr)
        labels.append(lab)

    fig, axes = plt.subplots(ncols=2, figsize=(10, 5))

    cmap = plt.cm.Blues
    colors = cmap(np.linspace(0., 1., len(labels[0])))

    # forward strand ORFs

    extra_artists = []
    if sum(fracs[0]) > 0:
        patches, texts = axes[0].pie(fracs[0], colors=colors)
        lgd = axes[0].legend(patches,
                             labels[0],
                             loc="center right",
                             bbox_to_anchor=(0, 0.5))
        axes[0].set_title("Strand: {}".format(strands[0]))

        extra_artists.append(lgd)
    else:
        title = "Strand: {}. No ORFs".format(strands[0])
        axes[0].set_title(title)
        axes[0].set_axis_off()

    # reverse strand ORFs
    if sum(fracs[1]) > 0:
        patches, texts = axes[1].pie(fracs[1], colors=colors)
        lgd = axes[1].legend(patches,
                             labels[1],
                             loc="center right",
                             bbox_to_anchor=(2.0, 0.5))
        axes[1].set_title("Strand: {}".format(strands[1]))
        extra_artists.append(lgd)
    else:
        title = "Strand: {}. No ORFs".format(strands[1])
        axes[1].set_title(title)
        axes[1].set_axis_off()

    if len(args.title) > 0:
        sup = fig.suptitle(args.title)
        extra_artists.append(sup)

    fig.savefig(args.out,
                bbox_extra_artists=extra_artists,
                bbox_inches='tight')
Example #7
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script extracts all of the ORFs from the given transcripts. "
        "It writes the result as a bed12+1 file. The additional field, 'orf_len', gives "
        "the length of the respective ORF. It removes duplicate ORFs.\n\nN.B. The DEBUG "
        "output for this script is _very_ verbose. It is not recommended to run this "
        "script with that logging level.")

    parser.add_argument('transcripts_bed',
                        help="The bed12 file containing the "
                        "transcript information")

    parser.add_argument('transcripts_fasta',
                        help="The fasta file containing the "
                        "spliced transcript sequences")

    parser.add_argument('out', help="The output (bed12+1 gz) file")

    parser.add_argument('--start-codons',
                        help="A list of codons which will be "
                        "treated as start codons when extracting ORFs",
                        nargs='+',
                        default=default_start_codons)

    parser.add_argument('--stop-codons',
                        help="A list of codons which will be "
                        "treated as stop codons when extracting ORFs",
                        nargs='+',
                        default=default_stop_codons)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # check if we wanted to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    msg = "Compiling start and stop codon regular expressions"
    logger.info(msg)

    start_codons_re = '|'.join(args.start_codons)
    stop_codons_re = '|'.join(args.stop_codons)

    start_codons_re = re.compile(start_codons_re)
    stop_codons_re = re.compile(stop_codons_re)

    msg = "Reading transcripts bed file"
    logger.info(msg)
    transcripts_bed = bed_utils.read_bed(args.transcripts_bed)

    msg = "Creating the sequence iterator"
    logger.info(msg)

    transcripts_fasta = fastx_utils.get_read_iterator(args.transcripts_fasta)

    transcripts_iter = ((get_transcript(transcript_header,
                                        transcripts_bed), transcript_sequence)
                        for (transcript_header,
                             transcript_sequence) in transcripts_fasta)

    msg = "Finding all ORFs"
    logger.info(msg)

    orfs = parallel.apply_parallel_iter(transcripts_iter,
                                        args.num_cpus,
                                        get_orfs,
                                        start_codons_re,
                                        stop_codons_re,
                                        total=len(transcripts_bed),
                                        progress_bar=True)

    msg = "Joining ORFs in a large data frame"
    logger.info(msg)

    orfs = pd.concat(orfs)

    msg = "Removing duplicate ORFs"
    logger.info(msg)

    orfs = orfs.drop_duplicates(subset=DUPLICATE_FIELDS)

    msg = "Numbering remaining ORFs"
    logger.info(msg)

    orfs['orf_num'] = np.arange(len(orfs))

    msg = "Writing ORFs to disk"
    logger.info(msg)
    bed_utils.write_bed(orfs, args.out)
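The compiled alternations above (for example 'ATG|...' for the start codons) are ordinary regular expressions, so candidate codon positions in a transcript sequence can be located with re.finditer. A small, self-contained sketch, not the actual get_orfs implementation; the codon list and sequence are made up:

import re

start_codons = ['ATG', 'CTG']                 # hypothetical; the script takes these from --start-codons
start_codons_re = re.compile('|'.join(start_codons))

sequence = "CCATGAAACTGTTTTAG"
start_positions = [m.start() for m in start_codons_re.finditer(sequence)]

print(start_positions)                        # [2, 8] for this toy sequence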
Example #8
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script uses the mygene.info service to find annotations "
        "for the transcripts associated with the ORFs in the given bed file. In "
        "particular, it extracts information from Swiss-Prot, TrEMBL, Interpro, "
        "PDB, Pfam, PROSITE, the Gene Ontology, and KEGG.")

    parser.add_argument('bed', help="The bed file")
    parser.add_argument('out', help="The output file. Its type will be inferred "
        "from its extension.")

    parser.add_argument('--do-not-trim', help="By default, the script will "
        "attempt to trim transcript identifiers such that they are valid Ensembl "
        "identifiers. If this flag is given, no trimming will take place.",
        action='store_true')

    parser.add_argument('--scopes', help="A list of scopes to use when querying "
        "mygene.info. Please see the documentation for more information about "
        "valid scopes: http://mygene.info/doc/query_service.html#available_fields",
        nargs='*', default=default_scopes)

    parser.add_argument('--do-not-convert-ids', help="By default, the script will "
        "treat the identifiers in the file as transcript identifiers. It first "
        "maps those to gene identifiers, and then it uses those to find the "
        "gene annotations. If the identifiers are already gene ids (or whatever "
        "is specified by scopes), then the first mapping is not necessary and "
        "can be skipped using this flag.", action='store_true')
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    convert_ids = not args.do_not_convert_ids

    msg = "Reading the bed file"
    logger.info(msg)
    bed = bed_utils.read_bed(args.bed)
    bed = bed[fields_to_keep]

    msg = "Extracting transcript ids"
    logger.info(msg)
    trim = not args.do_not_trim
    orf_ids = parallel.apply_iter_simple(bed['id'], parse_orf_id, trim)
    orf_ids_df = pd.DataFrame(orf_ids)

    if convert_ids:
        msg = "Querying transcript to gene id mapping"
        logger.info(msg)
        gene_ids = mygene_utils.get_transcript_to_gene_mapping(orf_ids_df['transcript_id'])
    else:
        gene_ids = pd.DataFrame()
        gene_ids['transcript_id'] = orf_ids_df['transcript_id']
        gene_ids['gene_id'] = orf_ids_df['transcript_id']

    msg = "Querying gene annotations"
    logger.info(msg)
    res_df = mygene_utils.query_mygene(gene_ids['gene_id'])

    msg = "Combining gene annotations with transcript ids"
    logger.info(msg)
    res_df = gene_ids.merge(res_df, on='gene_id', how='inner')

    msg = "Combining transcript annotations with ORF ids"
    logger.info(msg)
    orf_ids_fields = ['transcript_id', 'orf_id']
    res_df = orf_ids_df[orf_ids_fields].merge(res_df, on='transcript_id', how='inner')

    msg = "Combining ORF annotations with ORF predictions"
    logger.info(msg)
    res_df = bed.merge(res_df, left_on='id', right_on='orf_id', how='left')

    msg = "Writing ORF annotations to disk"
    logger.info(msg)
    utils.write_df(res_df, args.out, index=False)
Example #9
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script constructs the profile for each ORF. It "
        "first adjusts the mapped read positions to properly align with "
        "the P-sites. Second, it uses a custom chrom-sweep algorithm to "
        "find the coverage of each position in each exon of each ORF. Finally, "
        "the ORF exons are glued together to find the profile of the entire ORF."
    )

    parser.add_argument('bam',
                        help="The bam file including filtered (unique, "
                        "etc.) alignments")
    parser.add_argument('orfs', help="The (bed12) file containing the ORFs")
    parser.add_argument('exons', help="The (bed6+2) file containing the exons")
    parser.add_argument('out',
                        help="The (mtx.gz) output file containing the "
                        "ORF profiles")

    parser.add_argument(
        '-l',
        '--lengths',
        help="If any values are given, "
        "then only reads which have those lengths will be included in the "
        "signal construction.",
        type=int,
        default=default_lengths,
        nargs='*')
    parser.add_argument(
        '-o',
        '--offsets',
        help="The 5' end of reads will be "
        "shifted by this amount. There must be one offset value for each "
        "length (given by the --lengths argument.",
        type=int,
        default=default_offsets,
        nargs='*')

    parser.add_argument('-k',
                        '--num-exons',
                        help="If  k>0, then only the "
                        "first k exons will be processed.",
                        type=int,
                        default=default_num_exons)
    parser.add_argument(
        '-g',
        '--num-groups',
        help="The number of groups into "
        "which to split the exons. More groups means the progress bar is "
        "updated more frequently but incurs more overhead because of the "
        "parallel calls.",
        type=int,
        default=default_num_groups)

    parser.add_argument('--seqname-prefix',
                        help="If present, this string "
                        "will be prepended to the seqname field of the ORFs.",
                        default=default_seqname_prefix)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[extract-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    # make sure the number of lengths and offsets match
    if len(args.lengths) != len(args.offsets):
        msg = "The number of --lengths and --offsets do not match."
        raise ValueError(msg)

    # make sure the necessary files exist
    required_files = [args.bam, args.orfs, args.exons]
    msg = "[extract-orf-profiles]: Some input files were missing: "
    utils.check_files_exist(required_files, msg=msg)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    msg = "Finding P-sites"
    logger.info(msg)

    p_sites = ribo_utils.get_p_sites(args.bam, args.lengths, args.offsets)

    msg = "Reading exons"
    logger.info(msg)
    exons = bed_utils.read_bed(args.exons)

    msg = "Reading ORFs"
    logger.info(msg)

    orfs = bed_utils.read_bed(args.orfs)

    if len(args.seqname_prefix) > 0:
        orfs['seqname'] = args.seqname_prefix + orfs['seqname']
        exons['seqname'] = args.seqname_prefix + exons['seqname']

    if args.num_exons > 0:
        exons = exons.head(args.num_exons)

    num_orfs = orfs['orf_num'].max() + 1
    max_orf_len = orfs['orf_len'].max()

    msg = "Adding the ORF index to the exons"
    logger.info(msg)

    orf_fields = ['id', 'orf_num']
    exons_orfs = exons.merge(orfs[orf_fields], on='id')

    msg = "Splitting exons and P-sites"
    logger.info(msg)
    exon_groups = pandas_utils.split_df(exons_orfs, args.num_groups)

    exons_dfs = []
    psites_dfs = []

    for group_index, exon_group in exon_groups:
        # pull out only the p-sites that come from these chromosomes
        seqnames = set(exon_group['seqname'].unique())
        m_psites = p_sites['seqname'].isin(seqnames)

        exons_dfs.append(exon_group)
        psites_dfs.append(p_sites[m_psites])

    # we no longer need the full list of psites
    del p_sites
    del exons_orfs
    del exon_groups
    del exons
    gc.collect()
    exons_psites = zip(exons_dfs, psites_dfs)

    msg = "Finding all P-site intersections"
    logger.info(msg)

    sum_profiles = parallel.apply_parallel_iter(exons_psites,
                                                args.num_cpus,
                                                get_all_p_site_intersections,
                                                num_orfs,
                                                max_orf_len,
                                                progress_bar=True,
                                                total=args.num_groups)

    msg = "Combining the ORF profiles into one matrix"
    logger.info(msg)

    f = lambda x, y: x + y

    sum_profiles = functools.reduce(f, sum_profiles)
    sum_profiles_lil = sum_profiles.tolil()

    msg = "Flipping the reverse strand profiles"
    logger.info(msg)

    m_reverse = orfs['strand'] == '-'
    reverse_orfs = orfs[m_reverse]
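    # Interpretation (not stated explicitly in the code): the profiles are
    # accumulated in genomic (left-to-right) order, so the rows for
    # reverse-strand ORFs are reversed to make position 0 the translation start.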

    for idx, reverse_orf in tqdm.tqdm(reverse_orfs.iterrows(), total=len(reverse_orfs)):
        orf_num = reverse_orf['orf_num']

        if sum_profiles[orf_num].sum() == 0:
            continue

        orf_len = reverse_orf['orf_len']
        dense = utils.to_dense(sum_profiles, orf_num, length=orf_len)
        dense = dense[::-1]
        sum_profiles_lil[orf_num, :orf_len] = dense

    msg = "Writing the sparse matrix to disk"
    logger.info(msg)
    math_utils.write_sparse_matrix(args.out, sum_profiles_lil)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates a bar chart which shows the count of "
        "each ORF type in a given BED12+ file. Optionally, the ORFs can be "
        "grouped into similar types.")

    parser.add_argument('orfs', help="The BED12+ file with the ORFs")
    parser.add_argument('out', help="The output (image) file")

    parser.add_argument('--title',
                        help="The title to use for the plot",
                        default=default_title)

    parser.add_argument('--use-groups',
                        help="If this flag is given, the ORFs "
                        "will be grouped",
                        action='store_true')

    parser.add_argument('--fontsize', type=int, default=default_fontsize)
    parser.add_argument('--legend-fontsize', type=int, default=default_legend_fontsize)
    parser.add_argument('--ymax', type=int, default=default_ymax)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading bed file"
    logger.info(msg)

    bed = bed_utils.read_bed(args.orfs)

    if args.use_groups:
        bed['orf_type_group'] = bed['orf_type'].map(
            ribo_utils.orf_type_labels_reverse_mapping)
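        # orf_type_labels_reverse_mapping presumably maps each specific
        # orf_type to its broader label group for display purposes.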

        orf_type_counts = bed.groupby(['orf_type_group', 'strand']).size()
        orf_type_counts = orf_type_counts.reset_index(name="count")
        orf_type_counts['display_name'] = orf_type_counts[
            'orf_type_group'].map(ribo_utils.orf_type_labels_display_name_map)
    else:
        orf_type_counts = bed.groupby(['orf_type', 'strand']).size()
        orf_type_counts = orf_type_counts.reset_index(name="count")
        orf_type_counts['display_name'] = orf_type_counts['orf_type'].map(
            ribo_utils.orf_type_display_name_map)

    msg = "Creating the bar chart"

    color = sns.palettes.color_palette("Set3", n_colors=3)

    fig, ax = plt.subplots(figsize=(9, 5))
    sns.barplot(x="display_name",
                y="count",
                hue="strand",
                data=orf_type_counts,
                ax=ax,
                zorder=-1,
                palette='Set3')

    sns.despine()

    ax.legend(loc='upper right',
              bbox_to_anchor=(1.0, 0.95),
              fontsize=args.legend_fontsize,
              frameon=True,
              framealpha=0.9,
              title="Strand")
    mpl_utils.set_legend_title_fontsize(ax, args.fontsize)

    ax.set_yscale('log')
    ax.set_ylim((1, args.ymax))

    ax.set_ylabel("Number of ORFs", fontsize=args.fontsize)
    ax.set_xlabel("", fontsize=0)

    # rotate the ORF type names
    mpl_utils.set_ticklabels_fontsize(ax, args.fontsize)
    mpl_utils.set_ticklabel_rotation(ax, axis='x', rotation=90)

    # place the ORF type names in the middle of the bar
    for ticklabel in ax.xaxis.get_ticklabels():
        p = ticklabel.get_position()
        ticklabel.set_position((p[0], 0.1))
        ticklabel.set_verticalalignment('bottom')

    if args.title is not None:
        ax.set_title(args.title, fontsize=args.fontsize)

    if args.out is not None:
        fig.savefig(args.out, bbox_inches='tight')
Example #11
0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "Given a list of ORFs with associated Bayes factors and a fasta "
        "sequence file, this script extracts the sequences of the ORFs whose Bayes factor "
        "exceeds the given threshold. Finally, biopython is used to translate the "
        "selected ORFs into protein sequences.\n\n"
        "The min-length and minimum-profile-sum filters are applied in the obvious way.\n\n"
        "For both BF and chi-square predictions, only ORFs which have more reads in the "
        "first reading frame than either of the other two will be selected as translated. "
        "(This is called the 'frame filter' below.)\n\n"
        "The selection based on Bayes factors follows this logic: if max_bf_var is given, "
        "then it and min_bf_mean are taken as a hard threshold on the estimated Bayes "
        "factor mean. If min_bf_likelihood is given, then this min_bf_mean is taken as the "
        "boundary value; that is, an ORF is \"translated\" if:\n\n"
        "\t\t[P(bf > min_bf_mean)] > min_bf_likelihood\n\n"
        "If both max_bf_var and min_bf_likelihood are None, then min_bf_mean is taken as a "
        "hard threshold on the mean for selecting translated ORFs.\n\n"
        "If both max_bf_var and min_bf_likelihood are given, then both filters will be "
        "applied and the result will be the intersection.\n\n"
        "If the --use-chi-square option is given, the significance value is "
        "Bonferroni-corrected based on the number of ORFs which meet the length, profile "
        "and frame filters.")

    parser.add_argument('bayes_factors',
                        help="The file containing the ORFs and Bayes' "
                        "factors (BED12+)")
    parser.add_argument('fasta', help="The *genome* fasta file")
    parser.add_argument('predicted_orfs',
                        help="The (output) BED12+ file containing "
                        "the predicted ORFs.")
    parser.add_argument(
        'predicted_dna_sequences',
        help="The (output) fasta file "
        "containing the predicted ORF sequences, as DNA sequences")
    parser.add_argument(
        'predicted_protein_sequences',
        help="The (output) fasta file "
        "containing the predicted ORF sequences, as protein sequences")

    parser.add_argument(
        '--select-longest-by-stop',
        help="If this flag is given, then "
        "the selected ORFs will be merged based on stop codons. In particular, only the "
        "longest translated ORF at each stop codon will be selected.",
        action='store_true')

    parser.add_argument(
        '--select-best-overlapping',
        help="If this flag is given, then "
        "only the ORF with the highest estimated Bayes factor will be kept among each "
        "set of overlapping ORFs. N.B. This filter is applied *AFTER* selecting the "
        "longest ORF at each stop codon, if the --select-longest-by-stop flag is "
        "given.",
        action='store_true')

    parser.add_argument('--min-length',
                        help="The minimum length to predict an ORF "
                        "as translated",
                        type=int,
                        default=default_min_length)

    parser.add_argument('--min-bf-mean',
                        help="The minimum Bayes' factor mean to predict "
                        "an ORF as translated (use --help for more details)",
                        type=float,
                        default=default_min_bf_mean)
    parser.add_argument('--max-bf-var',
                        help="The maximum Bayes' factor variance to predict "
                        "an ORF as translated (use --help for more details)",
                        type=float,
                        default=default_max_bf_var)

    parser.add_argument(
        '--min-bf-likelihood',
        help="If given, then this is taken a threshold "
        "on the likelihood of translation (use --help for more details)",
        type=float,
        default=default_min_bf_likelihood)

    parser.add_argument(
        '--use-chi-square',
        help="If this flag is present, the the "
        "chi square value will be used to predict ORFs rather than the Bayes' factor",
        action='store_true')
    parser.add_argument(
        '--chisq-significance-level',
        help="If using chi square, then this "
        "value is Bonferroni corrected and used as the significance cutoff",
        type=float,
        default=default_chisq_significance_level)

    parser.add_argument('--filtered-orf-types',
                        help="A list of ORF types which will be "
                        "removed before selecting the final prediction set.",
                        nargs='*',
                        default=default_filtered_orf_types)

    parser.add_argument(
        '--filter-non-canonical-overlaps',
        help="If this flag is given, then "
        "--filtered-orf-types will be extended with the non-canonical overlap types ({})."
        .format(non_canonical_overlap_orf_types_str),
        action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # first, extract all of the predictions which exceed the threshold
    msg = "Reading Bayes factor information"
    logger.info(msg)

    bayes_factors = bed_utils.read_bed(args.bayes_factors)

    if args.filter_non_canonical_overlaps:
        args.filtered_orf_types.extend(non_canonical_overlap_orf_types)

    if len(args.filtered_orf_types) > 0:
        filtered_orf_types_str = ','.join(args.filtered_orf_types)
        msg = "Filtering these ORF types: {}".format(filtered_orf_types_str)
        logger.info(msg)

        m_orf_types = bayes_factors['orf_type'].isin(args.filtered_orf_types)
        bayes_factors = bayes_factors[~m_orf_types]

    msg = "Identifying ORFs which meet the prediction thresholds"
    logger.info(msg)

    all_orfs, bf_orfs, chisq_orfs = ribo_utils.get_predicted_orfs(
        bayes_factors,
        min_bf_mean=args.min_bf_mean,
        max_bf_var=args.max_bf_var,
        min_bf_likelihood=args.min_bf_likelihood,
        min_length=args.min_length,
        chisq_alpha=args.chisq_significance_level,
        select_longest_by_stop=args.select_longest_by_stop)
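    # Hedged sketch of the Bayes-factor selection described in the help text
    # above (not necessarily the actual ribo_utils.get_predicted_orfs logic):
    #
    #   keep = (bf_mean > min_bf_mean)
    #   if max_bf_var is not None:
    #       keep &= (bf_var < max_bf_var)
    #   if min_bf_likelihood is not None:
    #       keep &= (P(bf > min_bf_mean) > min_bf_likelihood)
    #
    # together with the length, profile and frame filters.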

    if args.use_chi_square:
        predicted_orfs = chisq_orfs
    else:
        predicted_orfs = bf_orfs

    msg = "Number of selected ORFs: {}".format(len(predicted_orfs))
    logger.info(msg)

    if args.select_best_overlapping:

        msg = "Finding overlapping ORFs"
        logger.info(msg)

        merged_intervals = bed_utils.merge_all_intervals(predicted_orfs)

        msg = "Selecting best among overlapping ORFs"
        logger.info(msg)

        predicted_orfs = parallel.apply_iter_simple(
            merged_intervals['merged_ids'],
            get_best_overlapping_orf,
            predicted_orfs,
            progress_bar=True)

        predicted_orfs = pd.DataFrame(predicted_orfs)
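        # Per the --select-best-overlapping help text, only the ORF with the
        # highest estimated Bayes factor is kept from each set of overlapping ORFs.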

    msg = "Sorting selected ORFs"
    logger.info(msg)

    predicted_orfs = bed_utils.sort(predicted_orfs)

    msg = "Writing selected ORFs to disk"
    logger.info(msg)
    bed_utils.write_bed(predicted_orfs, args.predicted_orfs)

    # now get the sequences
    msg = "Extracting predicted ORFs DNA sequence"
    logger.info(msg)

    split_exons = True
    transcript_sequences = bed_utils.get_all_bed_sequences(
        predicted_orfs, args.fasta, split_exons)

    fastx_utils.write_fasta(transcript_sequences,
                            args.predicted_dna_sequences,
                            compress=False)

    # translate the remaining ORFs into protein sequences
    msg = "Converting predicted ORF sequences to amino acids"
    logger.info(msg)

    records = fastx_utils.get_read_iterator(args.predicted_dna_sequences)
    protein_records = {r[0]: Bio.Seq.translate(r[1]) for r in records}
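    # Bio.Seq.translate uses the standard codon table by default and marks
    # stop codons with '*'.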

    fastx_utils.write_fasta(protein_records.items(),
                            args.predicted_protein_sequences,
                            compress=False)
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates a line graph showing the length distributions "
        "of the various types of ORFs. Optionally, it can also include the length "
        "distribution of ORFs downloaded from uniprot. If uniprot ORFs are given, then the "
        "KL-divergence between the type distributions and the uniprot ORFs is calculated.")

    parser.add_argument('orfs', help="The BED12+ file with the ORFs")
    parser.add_argument('out', help="The output (image) file")

    parser.add_argument('--uniprot', help="The uniprot ORF lengths, if available", 
        default=default_uniprot)
    parser.add_argument('--uniprot-label', help="The label to use for the uniprot ORFs in "
        "the plot", default=default_uniprot_label)
    parser.add_argument('--title', help="The title to use for the plot", 
        default=default_title)

    
    parser.add_argument('--use-groups', help="If this flag is given, the the ORFs "
        "will be grouped", action='store_true')
    
    args = parser.parse_args()

    orfs = bed_utils.read_bed(args.orfs)

    if args.use_groups:
        orf_lengths = [ get_orf_lengths(orfs, ribo_utils.orf_type_labels_mapping[label]) 
                    for label in ribo_utils.orf_type_labels]

        prediction_labels = [latex.get_latex_safe_string(l) 
                    for l in ribo_utils.orf_type_labels]

        prediction_lengths_list = orf_lengths
    else:
        orf_lengths = [ get_orf_lengths(orfs, [orf_type]) 
                    for orf_type in ribo_utils.orf_types]

        prediction_labels = [latex.get_latex_safe_string(l) 
                    for l in ribo_utils.orf_types]

        prediction_lengths_list = orf_lengths

    if os.path.exists(args.uniprot):
        truth_nt_lengths = bio.get_uniprot_nt_lengths(args.uniprot)
        truth_label = args.uniprot_label
    else:
        truth_nt_lengths = None
        truth_label = None

    #prediction_lengths_list = [bf_lengths, chisq_lengths]
    #prediction_labels = ['BF', r'$\chi^2$']

    # input: truth_nt_lengths (array-like)
    #        prediction_lengths_list (list of array-likes)
    #        truth_label (string)
    #        prediction_labels (list of strings)
    #
    # if truth_nt_lengths is not defined, then the KL-divergence calculations
    # will be skipped (and it will not be shown)

    fontsize = 20
    legend_fontsize = 20
    title_fontsize = 20
    linewidth = 4

    # plot the empirical distribution of ORF lengths
    hist_min = 200
    hist_max = 5250
    hist_step = 200
    hist_range = (hist_min, hist_max)
    hist_bins = np.arange(hist_min, hist_max, hist_step)
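    # np.arange(200, 5250, 200) yields edges 200, 400, ..., 5200, i.e. 25 bins
    # of width 200 nt.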

    if truth_nt_lengths is not None:
        truth_hist, _ = np.histogram(truth_nt_lengths, bins=hist_bins, range=hist_range, density=True)
    else:
        truth_hist = None
        
    prediction_hists = []
    for prediction_lengths in prediction_lengths_list:
        prediction_hist, _ = np.histogram(prediction_lengths, bins=hist_bins, range=hist_range, density=True)
        prediction_hists.append(prediction_hist)

    # now, normalize the histograms
    if truth_hist is not None:
        truth_hist = truth_hist / np.sum(truth_hist)
        truth_hist += 1e-3
        
    for i, prediction_hist in enumerate(prediction_hists):
        prediction_hists[i] = prediction_hist / np.sum(prediction_hist)
        prediction_hists[i] += 1e-3
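    # The small pseudocount (1e-3) keeps every bin strictly positive so the
    # KL-divergence below stays finite even when a histogram bin is empty.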

    kls = []
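    # calculate_symmetric_kl_divergence presumably evaluates the divergence in
    # both directions with scipy.stats.entropy (KL(p||q) and KL(q||p)) and
    # combines them, e.g. by taking their mean.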
    if truth_hist is not None:
        for i, prediction_hist in enumerate(prediction_hists):
            kl = math_utils.calculate_symmetric_kl_divergence(truth_hist, prediction_hist, scipy.stats.entropy)
            kls.append(kl)
            
            # and update the label
            prediction_labels[i] = '{}, KL: ${:.2f}$'.format(prediction_labels[i], kl)
            
    if truth_hist is not None:
        truth_hist = 100 * truth_hist
        
    for i, prediction_hist in enumerate(prediction_hists):
        prediction_hists[i] *= 100

    fig, ax = plt.subplots(figsize=(10,5))

    cm = plt.cm.gist_earth

    x = np.arange(len(hist_bins)-1)

    truth_cm_offset = 0.1
    if truth_hist is not None:
        color = cm(truth_cm_offset)
        ax.plot(x, truth_hist, label=truth_label, linewidth=linewidth, color=color)
        
    color_range = 1 - 2*truth_cm_offset
    for i, prediction_hist in enumerate(prediction_hists):
        color = i / len(prediction_hists) * color_range
        color += 2*truth_cm_offset
        color = cm(color)
        ax.plot(x, prediction_hist, label=prediction_labels[i], linewidth=linewidth, color=color)

    ax.set_xlabel('Length (bp)', fontsize=fontsize)
    ax.set_ylabel('\% of predicted ORFs', fontsize=fontsize)
    
    if len(args.title) > 0:
        ax.set_title(args.title, fontsize=fontsize)

    ax.set_xticks(x[::2])
    ax.set_xticklabels(hist_bins[::2], fontsize=fontsize, rotation=90)

    ax.set_ylim((0, 20))
    ax.set_xlim((0, len(hist_bins)))

    # hide the "0" tick label
    yticks = ax.yaxis.get_major_ticks()
    yticks[0].label1.set_visible(False)

    # chop off everything from 3000 on
    index_of_3000 = 14
    ax.set_xlim((0, index_of_3000))
    #ax.set_xlim((0, len(uniprot_hist)-1))

    lgd = ax.legend(loc='center right', fontsize=legend_fontsize, bbox_to_anchor=(1.75,0.5))
    ax.tick_params(axis='both', which='major', labelsize=fontsize)

    fig.savefig(args.out, bbox_inches='tight', bbox_extra_artists=(lgd,))
Example #13
0
def main():
    global profiles_data, profiles_indices, profiles_indptr, profiles_shape
    global translated_models, untranslated_models
    global args

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""
            This script uses Hamiltonian MCMC with Stan to estimate translation parameters
            for a set of regions (presumably ORFs). Roughly, it takes as input:

            (1) a set of regions (ORFs) and their corresponding profiles
            (2) a "translated" model which gives the probability that a region is translated
            (3) an "untranslated" model which gives the probability that a region is not translated

            The script first smoothes the profiles using LOWESS. It then calculates
            both the Bayes' factor (using the smoothed profile) and \chi^2 value
            (using the raw counts) for each ORF.
        """
        )

    parser.add_argument('profiles', help="The ORF profiles (counts) (mtx)")
    parser.add_argument('regions', help="The regions (ORFs) for which predictions will "
        "be made (BED12+)")
    
    parser.add_argument('out', help="The output file for the Bayes' factors (BED12+)")

    parser.add_argument('--chi-square-only', help="If this flag is present, then only the chi "
        "square test will be performed for each ORF. This can also be a way to get the counts "
        "within each of the ORFs.", action='store_true')
    
    parser.add_argument('--translated-models', help="The models to use as H_t (pkl)", nargs='+')
    parser.add_argument('--untranslated-models', help="The models to use as H_u (pkl)", nargs='+')

    ### filtering options
    parser.add_argument('--orf-types', help="If values are given, then only ORFs with "
        "those types are processed.", nargs='*', default=default_orf_types)
    parser.add_argument('--orf-type-field', default=default_orf_type_field)

    parser.add_argument('--min-length', help="ORFs with length less than this value will not "
        "be processed", type=int, default=default_min_length)
    parser.add_argument('--max-length', help="ORFs with length greater than this value will not "
        "be processed", type=int, default=default_max_length)
    parser.add_argument('--min-profile', help="ORFs with profile sum (i.e., number "
        "of reads) less than this value will not be processed.", type=float, 
        default=default_min_profile)

    ### smoothing options
    parser.add_argument('--fraction', help="The fraction of signal to use in LOWESS", 
        type=float, default=default_fraction)

    parser.add_argument('--reweighting-iterations', help="The number of reweighting "
        "iterations to use in LOWESS. Please see the statsmodels documentation for a "
        "detailed description of this parameter.", type=int, default=default_reweighting_iterations)

    ### MCMC options
    parser.add_argument('-s', '--seed', help="The random seed to use for inference",
        type=int, default=default_seed)
    parser.add_argument('-c', '--chains', help="The number of MCMC chains to use", type=int,
        default=default_chains)
    parser.add_argument('-i', '--iterations', help="The number of MCMC iterations to use for "
        "each chain", type=int, default=default_iterations)
    
    ### behavior options
    parser.add_argument('--num-orfs', help="If n>0, then only this many ORFs will be processed",
        type=int, default=default_num_orfs)
    parser.add_argument('--orf-num-field', default=default_orf_num_field)

    parser.add_argument('--do-not-compress', help="If this flag is given, the output will "
        "not be compressed; otherwise, it is written in GZip format", action='store_true')

    parser.add_argument('-g', '--num-groups', help="The number of groups into which to split "
        "the ORFs. More groups means the progress bar is updated more frequently but incurs "
        "more overhead because of the parallel calls.", type=int, default=default_num_groups)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # read in the regions and apply the filters
    msg = "Reading and filtering ORFs"
    logger.info(msg)
    regions = bed_utils.read_bed(args.regions)

    # by default, keep everything
    m_filters = np.array([True] * len(regions))

    if len(args.orf_types) > 0:
        m_orf_type = regions[args.orf_type_field].isin(args.orf_types)
        m_filters = m_orf_type & m_filters

    # min length
    if args.min_length > 0:
        m_min_length = regions['orf_len'] >= args.min_length
        m_filters = m_min_length & m_filters

    # max length
    if args.max_length > 0:
        m_max_length = regions['orf_len'] <= args.max_length
        m_filters = m_max_length & m_filters

    # min profile
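    # profiles.sum(axis=1) gives the total read count for each ORF row
    # (indexed by orf_num); ORFs whose total falls below --min-profile are
    # dropped from further processing.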
    profiles = scipy.io.mmread(args.profiles).tocsr()
    profiles_sums = profiles.sum(axis=1)
    good_orf_nums = np.where(profiles_sums >= args.min_profile)
    good_orf_nums = set(good_orf_nums[0])
    m_profile = regions['orf_num'].isin(good_orf_nums)
    m_filters = m_profile & m_filters

    regions = regions[m_filters]
    
    if args.num_orfs > 0:
        regions = regions.head(args.num_orfs)

    regions = regions.reset_index(drop=True)

    msg = "Number of regions after filtering: {}".format(len(regions))
    logger.info(msg)

    logger.debug("Reading models")
    translated_models = [pickle.load(open(tm, 'rb')) for tm in args.translated_models]
    untranslated_models = [pickle.load(open(bm, 'rb')) for bm in args.untranslated_models]
    
    profiles_data = multiprocessing.RawArray(ctypes.c_double, profiles.data.flat)
    profiles_indices = multiprocessing.RawArray(ctypes.c_int, profiles.indices)
    profiles_indptr = multiprocessing.RawArray(ctypes.c_int, profiles.indptr)
    profiles_shape = multiprocessing.RawArray(ctypes.c_int, profiles.shape)
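    # The CSR components (data, indices, indptr, shape) are placed in shared
    # RawArrays so the worker processes can presumably rebuild the sparse
    # matrix from these module-level globals instead of pickling the full
    # matrix for every parallel call.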

    bfs_l = parallel.apply_parallel_split(
        regions, 
        args.num_cpus,
        get_all_bayes_factors_args, 
        num_groups=args.num_groups,
        progress_bar=True
    )

    bfs = pd.concat(bfs_l)

    # write the results as a bed12+ file
    bed_utils.write_bed(bfs, args.out)