Example #1
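# Assumed imports and module-level names, following the rpbp sources; the
# exact module paths and default values depend on the installed version.
import argparse
import logging

import tqdm

import pbio.misc.logging_utils as logging_utils
import pbio.utils.bed_utils as bed_utils

logger = logging.getLogger(__name__)

# illustrative defaults
default_num_cpus = 1
default_annotated_exons = None
default_nonoverlapping_label = None
default_label_prefix = ""
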
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script labels the ORFs found with extract-orf-coordinates "
        "based on their exon structure and relation to the annotated, canonical "
        "ORFs. It requires the exon blocks for the ORFs (created with "
        "split-bed12-blocks). It completely reads in the ORFs, so unless otherwise "
        "desired for some reason, the input and output files can be the same.")

    parser.add_argument('annotated_transcripts',
                        help="The annotated transcripts "
                        "for the genome, in bed12+ format")
    parser.add_argument('extracted_orfs',
                        help="The ORFs extracted from the "
                        "transcripts, in bed12+ format")
    parser.add_argument('orf_exons',
                        help="The exon blocks for the ORFs, in "
                        "bed6+ format")

    parser.add_argument('out', help="The output (bed12+.gz) file")

    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of CPUs to use for "
                        "a few parts of the script",
                        type=int,
                        default=default_num_cpus)

    parser.add_argument(
        '-f',
        '--filter',
        help="If this flag is given, then ORFs "
        "which are completely covered by an annotated transcript are discarded. "
        "Presumably, this is used to filter uninteresting ORFs from de novo "
        "assemblies.",
        action='store_true')

    parser.add_argument(
        '-e',
        '--annotated-exons',
        help="If the --filter flag is "
        "given, the annotated transcript exons can optionally be provided with "
        "this option. If they are not given, they will be split from the annotated "
        "transcripts. That is generally not a very expensive operation relative to "
        "everything else in the labeling script. If --filter is not given, then "
        "these are ignored.",
        default=default_annotated_exons)

    parser.add_argument(
        '-n',
        '--nonoverlapping-label',
        help="If this option is "
        "given, then ORFs which do not overlap the annotated transcripts at all "
        "will be given this label. Otherwise, they will be labeled as \"suspect\"",
        default=default_nonoverlapping_label)

    parser.add_argument(
        '-l',
        '--label-prefix',
        help="This string is prepended "
        "to all labels assigned to ORFs. For example, it is a useful way to "
        "indicate ORFs from de novo assemblies are \"novel.\" In any case, this "
        "*is not* prepended to \"canonical\" ORFs.",
        default=default_label_prefix)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading annotated transcripts"
    logger.info(msg)
    annotated_transcripts = bed_utils.read_bed(args.annotated_transcripts)

    msg = "Reading extracted ORFs and exons"
    logger.info(msg)
    extracted_orfs = bed_utils.read_bed(args.extracted_orfs)
    extracted_orf_exons = bed_utils.read_bed(args.orf_exons)

    msg = "Found {} extracted ORFs with {} exons".format(
        len(extracted_orfs), len(extracted_orf_exons))
    logger.debug(msg)

    # check if we want to remove the extracted_orfs completely covered by
    # the annotated transcripts
    if args.filter:
        msg = ("Removing extracted ORFs which are completely covered by the "
               "annotated transcripts")
        logger.info(msg)

        # we need the annotated transcript exons
        if args.annotated_exons is None:
            msg = "Splitting the annotated transcripts into exon blocks"
            logger.info(msg)

            annotated_exons = bed_utils.split_bed12(annotated_transcripts,
                                                    num_cpus=args.num_cpus,
                                                    progress_bar=True)
        else:
            msg = "Reading the annotated transcript exons"
            logger.info(msg)

            annotated_exons = bed_utils.read_bed(args.annotated_exons)

        msg = "Finding completely covered extracted ORFs"
        logger.info(msg)

        # subtract_bed returns the ids of the (a) entries -- here, the ORF
        # exons -- which are *not* completely covered by the (b) entries
        nonoverlapping_ids = bed_utils.subtract_bed(extracted_orf_exons,
                                                    annotated_exons,
                                                    min_a_overlap=1)

        m_unfiltered = extracted_orfs['id'].isin(nonoverlapping_ids)
        extracted_orfs = extracted_orfs[m_unfiltered]

        # also discard the unnecessary exons
        m_unfiltered = extracted_orf_exons['id'].isin(nonoverlapping_ids)
        extracted_orf_exons = extracted_orf_exons[m_unfiltered]

        msg = "After filtering, {} extracted ORFs remain".format(
            len(extracted_orfs))
        logger.info(msg)

    # if the nonoverlapping-label is given, annotate and remove the ORFs
    # which do not overlap the annotations at all
    if args.nonoverlapping_label is not None:

        # the subtraction below needs the annotated exon blocks; if the
        # --filter branch above did not already create them, do so now
        if not args.filter:
            msg = "Splitting the annotated transcripts into exon blocks"
            logger.info(msg)

            annotated_exons = bed_utils.split_bed12(annotated_transcripts,
                                                    num_cpus=args.num_cpus,
                                                    progress_bar=True)

        nonoverlapping_ids = bed_utils.subtract_bed(
            extracted_orfs,
            annotated_transcripts,
            exons_a=extracted_orf_exons,
            exons_b=annotated_exons)

        m_nonoverlapping = extracted_orf_exons['id'].isin(nonoverlapping_ids)
        extracted_orf_exons = extracted_orf_exons[~m_nonoverlapping]

        m_nonoverlapping = extracted_orfs['id'].isin(nonoverlapping_ids)
        extracted_orfs.loc[m_nonoverlapping,
                           'orf_type'] = args.nonoverlapping_label

        msg = ("Found {} ORFs completely nonoverlapping annotated transcripts".
               format(len(nonoverlapping_ids)))
        logger.info(msg)

    msg = "Removing the annotated UTRs from the transcripts"
    logger.info(msg)
    canonical_orfs = bed_utils.retain_all_thick_only(annotated_transcripts,
                                                     num_cpus=args.num_cpus)

    msg = "Splitting the canonical ORFs into exons"
    logger.info(msg)
    canonical_orf_exons = bed_utils.split_bed12(canonical_orfs,
                                                num_cpus=args.num_cpus,
                                                progress_bar=True)

    msg = "Extracting annotated 5' leader regions"
    logger.info(msg)
    five_prime_regions = bed_utils.retain_all_five_prime_of_thick(
        annotated_transcripts, num_cpus=args.num_cpus)

    if len(five_prime_regions) == 0:
        msg = "No annotated 5' leader regions were found"
        logger.warning(msg)

    msg = "Splitting the 5' leaders into exons"
    logger.info(msg)
    five_prime_exons = bed_utils.split_bed12(five_prime_regions,
                                             num_cpus=args.num_cpus,
                                             progress_bar=True)

    msg = "Extracting annotated 3' trailer regions"
    logger.info(msg)
    three_prime_regions = bed_utils.retain_all_three_prime_of_thick(
        annotated_transcripts, num_cpus=args.num_cpus)

    if len(three_prime_regions) == 0:
        msg = "No annotated 3' trailer regions were found"
        logger.warning(msg)

    msg = "Splitting the 3' trailers into exons"
    logger.info(msg)
    three_prime_exons = bed_utils.split_bed12(three_prime_regions,
                                              num_cpus=args.num_cpus,
                                              progress_bar=True)

    msg = "Splitting noncoding transcripts into exons"
    logger.info(msg)

    m_no_thick_start = annotated_transcripts['thick_start'] == -1
    m_no_thick_end = annotated_transcripts['thick_end'] == -1
    m_no_thick = m_no_thick_start & m_no_thick_end
    noncoding_transcripts = annotated_transcripts[m_no_thick]

    noncoding_exons = bed_utils.split_bed12(noncoding_transcripts,
                                            num_cpus=args.num_cpus,
                                            progress_bar=True)

    msg = "Marking canonical and extracted ORFs with the same stop codon"
    logger.info(msg)

    # first, add the true ORF end: the stop-codon coordinate, which is 'end'
    # on the forward strand and 'start' on the reverse strand
    m_forward_canonical = canonical_orfs['strand'] == '+'
    m_reverse_canonical = canonical_orfs['strand'] == '-'

    m_forward_extracted = extracted_orfs['strand'] == '+'
    m_reverse_extracted = extracted_orfs['strand'] == '-'

    canonical_orfs['orf_end'] = canonical_orfs['end']
    canonical_orfs.loc[m_reverse_canonical,
                       'orf_end'] = canonical_orfs.loc[m_reverse_canonical,
                                                       'start']

    extracted_orfs['orf_end'] = extracted_orfs['end']
    extracted_orfs.loc[m_reverse_extracted,
                       'orf_end'] = extracted_orfs.loc[m_reverse_extracted,
                                                       'start']

    # now, find extracted ORFs with the same "orf_end" (and seqname, strand) as canonical ORFs
    merge_fields = ['seqname', 'strand', 'orf_end']
    canonical_extracted_orf_ends = canonical_orfs.merge(
        extracted_orfs, on=merge_fields, suffixes=['_canonical', '_extracted'])

    # now, pull the matching (canonical, extracted) id pairs into a set
    canonical_extracted_matching_ends = set(
        zip(canonical_extracted_orf_ends['id_canonical'],
            canonical_extracted_orf_ends['id_extracted']))

    msg = "Finding ORFs which exactly overlap the canonical ORFs"
    logger.info(msg)

    exact_matches = bed_utils.get_bed_overlaps(canonical_orf_exons,
                                               extracted_orf_exons,
                                               min_a_overlap=1,
                                               min_b_overlap=1)

    exact_match_orf_ids = {o.b_info for o in exact_matches}
    m_exact_orf_matches = extracted_orf_exons['id'].isin(exact_match_orf_ids)
    extracted_orf_exons = extracted_orf_exons[~m_exact_orf_matches]

    m_canonical = extracted_orfs['id'].isin(exact_match_orf_ids)
    extracted_orfs.loc[m_canonical, 'orf_type'] = 'canonical'

    msg = "Found {} canonical ORFs".format(len(exact_match_orf_ids))
    logger.info(msg)

    msg = "Finding ORFs which are extended versions of the canonical ORFs"
    logger.info(msg)

    extended_matches = bed_utils.get_bed_overlaps(canonical_orf_exons,
                                                  extracted_orf_exons,
                                                  min_a_overlap=1)

    # make sure the "end"s match before calling something an extended match
    extended_match_ids = {
        m.b_info
        for m in tqdm.tqdm(extended_matches)
        if (m.a_info, m.b_info) in canonical_extracted_matching_ends
    }

    m_extended_matches = extracted_orf_exons['id'].isin(extended_match_ids)
    extracted_orf_exons = extracted_orf_exons[~m_extended_matches]

    m_canonical_extended = extracted_orfs['id'].isin(extended_match_ids)

    label = "{}canonical_extended".format(args.label_prefix)
    extracted_orfs.loc[m_canonical_extended, 'orf_type'] = label

    msg = "Found {} canonical_extended ORFs".format(len(extended_match_ids))
    logger.info(msg)

    msg = "Finding ORFs which are truncated versions of the canonical ORFs"
    logger.info(msg)

    truncated_matches = bed_utils.get_bed_overlaps(canonical_orf_exons,
                                                   extracted_orf_exons,
                                                   min_b_overlap=1)

    # make sure the "end"s match before calling something a truncated match
    truncated_match_ids = {
        m.b_info
        for m in tqdm.tqdm(truncated_matches)
        if (m.a_info, m.b_info) in canonical_extracted_matching_ends
    }

    m_truncated_matches = extracted_orf_exons['id'].isin(truncated_match_ids)
    extracted_orf_exons = extracted_orf_exons[~m_truncated_matches]

    m_canonical_truncated = extracted_orfs['id'].isin(truncated_match_ids)

    label = "{}canonical_truncated".format(args.label_prefix)
    extracted_orfs.loc[m_canonical_truncated, 'orf_type'] = label

    msg = "Found {} canonical_truncated ORFs".format(len(truncated_match_ids))
    logger.info(msg)

    msg = ("Labeling ORFs which are completely covered by a canonical ORF but "
           "do not share its stop codon")
    logger.info(msg)

    # anything in "truncated matches" which *does not* share a stop codon with
    # the match is a "within" orf
    within_ids = {
        m.b_info
        for m in truncated_matches if m.b_info not in truncated_match_ids
    }

    m_within_matches = extracted_orf_exons['id'].isin(within_ids)
    extracted_orf_exons = extracted_orf_exons[~m_within_matches]

    m_within = extracted_orfs['id'].isin(within_ids)

    label = "{}within".format(args.label_prefix)
    extracted_orfs.loc[m_within, 'orf_type'] = label

    msg = "Found {} within ORFs".format(len(within_ids))
    logger.info(msg)

    msg = "Finding out-of-frame overlaps"
    logger.info(msg)
    out_of_frame_matches = bed_utils.get_bed_overlaps(canonical_orf_exons,
                                                      extracted_orf_exons)

    msg = "Finding leader overlaps"
    logger.info(msg)

    leader_matches = bed_utils.get_bed_overlaps(five_prime_exons,
                                                extracted_orf_exons)

    msg = "Finding trailer overlaps"
    logger.info(msg)

    trailer_matches = bed_utils.get_bed_overlaps(three_prime_exons,
                                                 extracted_orf_exons)

    msg = ("Labeling ORFs which have (out-of-frame) overlaps with both a "
           "canonical ORF and annotated leaders or trailers")
    logger.info(msg)

    out_of_frame_ids = {m.b_info for m in out_of_frame_matches}
    leader_ids = {m.b_info for m in leader_matches}
    trailer_ids = {m.b_info for m in trailer_matches}

    leader_overlap_ids = out_of_frame_ids & leader_ids
    trailer_overlap_ids = out_of_frame_ids & trailer_ids

    m_leader_overlap_matches = extracted_orf_exons['id'].isin(
        leader_overlap_ids)
    extracted_orf_exons = extracted_orf_exons[~m_leader_overlap_matches]

    m_trailer_overlap_matches = extracted_orf_exons['id'].isin(
        trailer_overlap_ids)
    extracted_orf_exons = extracted_orf_exons[~m_trailer_overlap_matches]

    m_five_prime_overlap = extracted_orfs['id'].isin(leader_overlap_ids)

    label = "{}five_prime_overlap".format(args.label_prefix)
    extracted_orfs.loc[m_five_prime_overlap, 'orf_type'] = label

    m_three_prime_overlap = extracted_orfs['id'].isin(trailer_overlap_ids)

    label = "{}three_prime_overlap".format(args.label_prefix)
    extracted_orfs.loc[m_three_prime_overlap, 'orf_type'] = label

    msg = "Found {} five_prime_overlap ORFs".format(len(leader_overlap_ids))
    logger.info(msg)

    msg = "Found {} three_prime_overlap ORFs".format(len(trailer_overlap_ids))
    logger.info(msg)

    msg = "Finding ORFs completely within 5' leaders"
    logger.info(msg)

    leader_matches = bed_utils.get_bed_overlaps(five_prime_exons,
                                                extracted_orf_exons,
                                                min_b_overlap=1)
    leader_ids = {m.b_info for m in leader_matches}

    m_leader_matches = extracted_orf_exons['id'].isin(leader_ids)
    extracted_orf_exons = extracted_orf_exons[~m_leader_matches]

    m_five_prime = extracted_orfs['id'].isin(leader_ids)

    label = "{}five_prime".format(args.label_prefix)
    extracted_orfs.loc[m_five_prime, 'orf_type'] = label

    msg = "Found {} five_prime ORFs".format(len(leader_ids))
    logger.info(msg)

    msg = "Finding ORFs completely within 3' trailers"
    logger.info(msg)

    trailer_matches = bed_utils.get_bed_overlaps(three_prime_exons,
                                                 extracted_orf_exons,
                                                 min_b_overlap=1)
    trailer_ids = {m.b_info for m in trailer_matches}

    m_trailer_matches = extracted_orf_exons['id'].isin(trailer_ids)
    extracted_orf_exons = extracted_orf_exons[~m_trailer_matches]

    m_three_prime = extracted_orfs['id'].isin(trailer_ids)

    label = "{}three_prime".format(args.label_prefix)
    extracted_orfs.loc[m_three_prime, 'orf_type'] = label

    msg = "Found {} three_prime ORFs".format(len(trailer_ids))
    logger.info(msg)

    msg = "Finding ORFs completely within annotated, noncoding transcripts"
    logger.info(msg)

    noncoding_matches = bed_utils.get_bed_overlaps(noncoding_exons,
                                                   extracted_orf_exons,
                                                   min_b_overlap=1)

    noncoding_ids = {m.b_info for m in noncoding_matches}

    m_noncoding_matches = extracted_orf_exons['id'].isin(noncoding_ids)
    extracted_orf_exons = extracted_orf_exons[~m_noncoding_matches]

    m_noncoding = extracted_orfs['id'].isin(noncoding_ids)

    label = "{}noncoding".format(args.label_prefix)
    extracted_orfs.loc[m_noncoding, 'orf_type'] = label

    msg = "Found {} noncoding ORFs".format(len(noncoding_ids))
    logger.info(msg)

    # all of the remaining ORFs fall into the "suspect" category
    suspect_ids = set(extracted_orf_exons['id'])

    m_suspect = extracted_orfs['id'].isin(suspect_ids)

    label = "{}suspect".format(args.label_prefix)
    extracted_orfs.loc[m_suspect, 'orf_type'] = label

    msg = "Found {} \"suspect\" ORFs".format(len(suspect_ids))
    logger.info(msg)

    m_no_orf_type = extracted_orfs['orf_type'].isnull()

    msg = "Found {} unlabeled ORFs".format(sum(m_no_orf_type))
    logger.info(msg)

    msg = "Writing ORFs with types to disk"
    logger.info(msg)

    fields = bed_utils.bed12_field_names + ['orf_num', 'orf_len', 'orf_type']
    extracted_orfs = extracted_orfs[fields]
    extracted_orfs = bed_utils.sort(extracted_orfs)

    bed_utils.write_bed(extracted_orfs, args.out)
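
The stop-codon matching step above reduces to an ordinary pandas merge: two ORFs share a stop codon exactly when they agree on (seqname, strand, orf_end). A minimal toy sketch with hypothetical data, independent of bed_utils:

import pandas as pd

canonical = pd.DataFrame({
    'id': ['c1'], 'seqname': ['chr1'], 'strand': ['+'], 'orf_end': [300]})
extracted = pd.DataFrame({
    'id': ['o1', 'o2'], 'seqname': ['chr1', 'chr1'],
    'strand': ['+', '+'], 'orf_end': [300, 450]})

merged = canonical.merge(extracted, on=['seqname', 'strand', 'orf_end'],
                         suffixes=['_canonical', '_extracted'])
matching_ends = set(zip(merged['id_canonical'], merged['id_extracted']))
# {('c1', 'o1')}: only o1 ends at the same stop codon as the canonical ORF
print(matching_ends)
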
Example #2
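# Assumed imports and module-level names, following the rpbp sources; the
# exact module paths and default values depend on the installed version.
# get_transcript, get_orfs and DUPLICATE_FIELDS are helpers defined
# elsewhere in the original script.
import argparse
import logging
import re
import sys

import numpy as np
import pandas as pd

import pbio.misc.logging_utils as logging_utils
import pbio.misc.parallel as parallel
import pbio.misc.slurm as slurm
import pbio.utils.bed_utils as bed_utils
import pbio.utils.fastx_utils as fastx_utils

logger = logging.getLogger(__name__)

# illustrative defaults
default_start_codons = ['ATG']
default_stop_codons = ['TAA', 'TGA', 'TAG']
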
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script extracts all of the ORFs from the given transcripts. "
        "It writes the result as a bed12+1 file. The additional field, 'orf_len', gives "
        "the length of the respective ORF. It removes duplicate ORFs.\n\nN.B. The DEBUG "
        "output for this script is _very_ verbose. It is not recommended to run this "
        "script with that logging level.")

    parser.add_argument('transcripts_bed',
                        help="The bed12 file containing the "
                        "transcript information")

    parser.add_argument('transcripts_fasta',
                        help="The fasta file containing the "
                        "spliced transcript sequences")

    parser.add_argument('out', help="The output (bed12+1 gz) file")

    parser.add_argument('--start-codons',
                        help="A list of codons which will be "
                        "treated as start codons when extracting ORFs",
                        nargs='+',
                        default=default_start_codons)

    parser.add_argument('--stop-codons',
                        help="A list of codons which will be "
                        "treated as stop codons when extracting ORFs",
                        nargs='+',
                        default=default_stop_codons)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # check if we wanted to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    msg = "Compiling start and stop codon regular expressions"
    logger.info(msg)

    # e.g., ['TAA', 'TGA', 'TAG'] becomes the alternation 'TAA|TGA|TAG'
    start_codons_re = '|'.join(args.start_codons)
    stop_codons_re = '|'.join(args.stop_codons)

    start_codons_re = re.compile(start_codons_re)
    stop_codons_re = re.compile(stop_codons_re)

    msg = "Reading transcripts bed file"
    logger.info(msg)
    transcripts_bed = bed_utils.read_bed(args.transcripts_bed)

    msg = "Creating the sequence iterator"
    logger.info(msg)

    transcripts_fasta = fastx_utils.get_read_iterator(args.transcripts_fasta)

    transcripts_iter = (
        (get_transcript(transcript_header, transcripts_bed), transcript_sequence)
        for (transcript_header, transcript_sequence) in transcripts_fasta)

    msg = "Finding all ORFs"
    logger.info(msg)

    orfs = parallel.apply_parallel_iter(transcripts_iter,
                                        args.num_cpus,
                                        get_orfs,
                                        start_codons_re,
                                        stop_codons_re,
                                        total=len(transcripts_bed),
                                        progress_bar=True)

    msg = "Joining ORFs in a large data frame"
    logger.info(msg)

    orfs = pd.concat(orfs)

    msg = "Removing duplicate ORFs"
    logger.info(msg)

    orfs = orfs.drop_duplicates(subset=DUPLICATE_FIELDS)

    msg = "Numbering remaining ORFs"
    logger.info(msg)

    orfs['orf_num'] = np.arange(len(orfs))

    msg = "Writing ORFs to disk"
    logger.info(msg)
    bed_utils.write_bed(orfs, args.out)
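
The core of the extraction is the codon scan sketched below: build one alternation per codon set, find all start and stop positions, and pair each start with its first in-frame stop. This is a simplified, hypothetical illustration; the real get_orfs presumably also handles reading frames across spliced coordinates.

import re

start_re = re.compile('|'.join(['ATG']))
stop_re = re.compile('|'.join(['TAA', 'TGA', 'TAG']))

seq = 'CCATGAAATGATAG'
starts = [m.start() for m in start_re.finditer(seq)]
stops = [m.start() for m in stop_re.finditer(seq)]

orfs = []
for start in starts:
    in_frame_stops = [s for s in stops if s > start and (s - start) % 3 == 0]
    if in_frame_stops:
        # the ORF runs from the start codon through its first in-frame stop
        orfs.append((start, min(in_frame_stops) + 3))

print(orfs)  # [(2, 11)]: 'ATG AAA TGA'
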
Example #3
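# Assumed imports and module-level names, following the rpbp sources; the
# exact module paths and default values depend on the installed version.
# get_best_overlapping_orf, the default_* values and the non-canonical
# overlap ORF type lists are defined elsewhere in the original script.
import argparse
import logging

import pandas as pd
import Bio.Seq

import pbio.misc.logging_utils as logging_utils
import pbio.misc.parallel as parallel
import pbio.utils.bed_utils as bed_utils
import pbio.utils.fastx_utils as fastx_utils
import pbio.ribo.ribo_utils as ribo_utils

logger = logging.getLogger(__name__)
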
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "Given a list of ORFs with associated Bayes factors and a fasta "
        "sequence file, this script extracts the sequences of the ORFs whose Bayes factor "
        "exceeds the given threshold. Finally, biopython is used to translate the "
        "selected ORFs into protein sequences.\n\n"
        "The min-length and minimum-profile-sum filters are applied in the obvious way.\n\n"
        "For both BF and chi-square predictions, only ORFs which have more reads in the "
        "first reading frame than either of the other two will be selected as translated. "
        "(This is called the 'frame filter' below.)\n\n"
        "The selection based on Bayes factors follows this logic: if max_bf_var is given, "
        "then it and min_bf_mean are taken as hard thresholds on the estimated Bayes "
        "factor variance and mean, respectively. If min_bf_likelihood is given, then "
        "min_bf_mean is taken as the boundary value; that is, an ORF is \"translated\" if:\n\n"
        "\t\t[P(bf > min_bf_mean)] > min_bf_likelihood\n\n"
        "If both max_bf_var and min_bf_likelihood are None, then min_bf_mean is taken as a "
        "hard threshold on the mean for selecting translated ORFs.\n\n"
        "If both max_bf_var and min_bf_likelihood are given, then both filters will be "
        "applied and the result will be the intersection.\n\n"
        "If the --use-chi-square option is given, the significance value is "
        "Bonferroni-corrected based on the number of ORFs which meet the length, profile "
        "and frame filters.")

    parser.add_argument('bayes_factors',
                        help="The file containing the ORFs and Bayes' "
                        "factors (BED12+)")
    parser.add_argument('fasta', help="The *genome* fasta file")
    parser.add_argument('predicted_orfs',
                        help="The (output) BED12+ file containing "
                        "the predicted ORFs.")
    parser.add_argument(
        'predicted_dna_sequences',
        help="The (output) fasta file "
        "containing the predicted ORF sequences, as DNA sequences")
    parser.add_argument(
        'predicted_protein_sequences',
        help="The (output) fasta file "
        "containing the predicted ORF sequences, as protein sequences")

    parser.add_argument(
        '--select-longest-by-stop',
        help="If this flag is given, then "
        "the selected ORFs will be merged based on stop codons. In particular, only the "
        "longest translated ORF at each stop codon will be selected.",
        action='store_true')

    parser.add_argument(
        '--select-best-overlapping',
        help="If this flag is given, then "
        "only the ORF with the highest estimated Bayes factor will be kept among each "
        "set of overlapping ORFs. N.B. This filter is applied *AFTER* selecting the "
        "longest ORF at each stop codon, if the --select-longest-by-stop flag is "
        "given.",
        action='store_true')

    parser.add_argument('--min-length',
                        help="The minimum length to predict an ORF "
                        "as translated",
                        type=int,
                        default=default_min_length)

    parser.add_argument('--min-bf-mean',
                        help="The minimum Bayes' factor mean to predict "
                        "an ORF as translated (use --help for more details)",
                        type=float,
                        default=default_min_bf_mean)
    parser.add_argument('--max-bf-var',
                        help="The maximum Bayes' factor variance to predict "
                        "an ORF as translated (use --help for more details)",
                        type=float,
                        default=default_max_bf_var)

    parser.add_argument(
        '--min-bf-likelihood',
        help="If given, then this is taken as a threshold "
        "on the likelihood of translation (use --help for more details)",
        type=float,
        default=default_min_bf_likelihood)

    parser.add_argument(
        '--use-chi-square',
        help="If this flag is present, then the "
        "chi-square value will be used to predict ORFs rather than the Bayes' factor",
        action='store_true')
    parser.add_argument(
        '--chisq-significance-level',
        help="If using chi square, then this "
        "value is Bonferroni corrected and used as the significance cutoff",
        type=float,
        default=default_chisq_significance_level)

    parser.add_argument('--filtered-orf-types',
                        help="A list of ORF types which will be "
                        "removed before selecting the final prediction set.",
                        nargs='*',
                        default=default_filtered_orf_types)

    parser.add_argument(
        '--filter-non-canonical-overlaps',
        help="If this flag is given, then "
        "--filtered-orf-types will be extended with the non-canonical overlap types ({})."
        .format(non_canonical_overlap_orf_types_str),
        action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # first, extract all of the predictions which exceed the threshold
    msg = "Reading Bayes factor information"
    logger.info(msg)

    bayes_factors = bed_utils.read_bed(args.bayes_factors)

    if args.filter_non_canonical_overlaps:
        args.filtered_orf_types.extend(non_canonical_overlap_orf_types)

    if len(args.filtered_orf_types) > 0:
        filtered_orf_types_str = ','.join(args.filtered_orf_types)
        msg = "Filtering these ORF types: {}".format(filtered_orf_types_str)
        logger.info(msg)

        m_orf_types = bayes_factors['orf_type'].isin(args.filtered_orf_types)
        bayes_factors = bayes_factors[~m_orf_types]

    msg = "Identifying ORFs which meet the prediction thresholds"
    logger.info(msg)

    all_orfs, bf_orfs, chisq_orfs = ribo_utils.get_predicted_orfs(
        bayes_factors,
        min_bf_mean=args.min_bf_mean,
        max_bf_var=args.max_bf_var,
        min_bf_likelihood=args.min_bf_likelihood,
        min_length=args.min_length,
        chisq_alpha=args.chisq_significance_level,
        select_longest_by_stop=args.select_longest_by_stop)

    if args.use_chi_square:
        predicted_orfs = chisq_orfs
    else:
        predicted_orfs = bf_orfs

    msg = "Number of selected ORFs: {}".format(len(predicted_orfs))
    logger.info(msg)

    if args.select_best_overlapping:

        msg = "Finding overlapping ORFs"
        logger.info(msg)

        merged_intervals = bed_utils.merge_all_intervals(predicted_orfs)

        msg = "Selecting best among overlapping ORFs"
        logger.info(msg)

        predicted_orfs = parallel.apply_iter_simple(
            merged_intervals['merged_ids'],
            get_best_overlapping_orf,
            predicted_orfs,
            progress_bar=True)

        predicted_orfs = pd.DataFrame(predicted_orfs)

    msg = "Sorting selected ORFs"
    logger.info(msg)

    predicted_orfs = bed_utils.sort(predicted_orfs)

    msg = "Writing selected ORFs to disk"
    logger.info(msg)
    bed_utils.write_bed(predicted_orfs, args.predicted_orfs)

    # now get the sequences
    msg = "Extracting predicted ORF DNA sequences"
    logger.info(msg)

    split_exons = True
    transcript_sequences = bed_utils.get_all_bed_sequences(
        predicted_orfs, args.fasta, split_exons)

    fastx_utils.write_fasta(transcript_sequences,
                            args.predicted_dna_sequences,
                            compress=False)

    # translate the remaining ORFs into protein sequences
    msg = "Converting predicted ORF sequences to amino acids"
    logger.info(msg)

    records = fastx_utils.get_read_iterator(args.predicted_dna_sequences)
    protein_records = {r[0]: Bio.Seq.translate(r[1]) for r in records}

    fastx_utils.write_fasta(protein_records.items(),
                            args.predicted_protein_sequences,
                            compress=False)
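
The min_bf_likelihood selection described in the help text can be read as a tail probability on the Bayes factor estimate. A hypothetical sketch, assuming the estimate is summarized by a mean and variance and approximated as normal (the real selection logic lives in ribo_utils.get_predicted_orfs):

import scipy.stats

bf_mean, bf_var = 12.0, 16.0
min_bf_mean, min_bf_likelihood = 5.0, 0.5

# P(bf > min_bf_mean) under the normal approximation
p_translated = scipy.stats.norm.sf(min_bf_mean, loc=bf_mean, scale=bf_var ** 0.5)
print(p_translated > min_bf_likelihood)  # True: 5 is 1.75 sd below the mean
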
Example #4
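# Assumed imports and module-level names, following the rpbp sources; the
# exact module paths depend on the installed version. get_orfs (the ORF
# extraction driver) is defined elsewhere in the original script.
import argparse
import logging
import os
import sys

import yaml

import pbio.misc.logging_utils as logging_utils
import pbio.misc.shell_utils as shell_utils
import pbio.misc.slurm as slurm
import pbio.misc.utils as utils
import pbio.ribo.filenames as filenames
import pbio.utils.bed_utils as bed_utils
import pbio.utils.bio as bio
import pbio.utils.star_utils as star_utils

logger = logging.getLogger(__name__)
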
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script creates all of the files necessary for downstream "
        "analysis performed with the rpbp package.")
    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files "
                        "will be overwritten.",
                        action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)

    config = yaml.safe_load(open(args.config))
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'extract-orf-coordinates', 'label-orfs', 'bowtie2-build-s',
        'split-bed12-blocks', 'gtf-to-bed12', args.star_executable
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'genome_base_path', 'genome_name', 'gtf', 'fasta', 'ribosomal_fasta',
        'ribosomal_index', 'star_index'
    ]
    utils.check_keys_exist(config, required_keys)

    # check that the required files are present
    files = [config['gtf'], config['fasta'], config['ribosomal_fasta']]

    if 'de_novo_gtf' in config:
        files += [config['de_novo_gtf']]

    utils.check_files_exist(files, source='prepare-rpbp-genome')

    # now, check if we want to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # the rrna index
    cmd = "bowtie2-build-s {} {}".format(config['ribosomal_fasta'],
                                         config['ribosomal_index'])

    in_files = [config['ribosomal_fasta']]
    out_files = bio.get_bowtie2_index_files(config['ribosomal_index'])
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)

    # the STAR index
    mem = utils.human2bytes(args.mem)
    cmd = ("{} --runMode genomeGenerate --genomeDir {} --genomeFastaFiles {} "
           "--runThreadN {} --limitGenomeGenerateRAM {}".format(
               args.star_executable, config['star_index'], config['fasta'],
               args.num_cpus, mem))

    in_files = [config['fasta']]
    out_files = star_utils.get_star_index_files(config['star_index'])
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)

    # get the main orfs
    get_orfs(config['gtf'], args, config, is_annotated=True, is_de_novo=False)

    # eventually, we will use these names
    annotated_orfs = filenames.get_orfs(config['genome_base_path'],
                                        config['genome_name'],
                                        note=config.get('orf_note'),
                                        is_annotated=True,
                                        is_de_novo=False)

    annotated_exons_file = filenames.get_exons(config['genome_base_path'],
                                               config['genome_name'],
                                               note=config.get('orf_note'),
                                               is_annotated=True,
                                               is_de_novo=False)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    # now, check if we have a de novo assembly
    if 'de_novo_gtf' in config:
        get_orfs(config['de_novo_gtf'],
                 args,
                 config,
                 is_annotated=False,
                 is_de_novo=True)

        # we need to concat the ORF and exon files
        de_novo_orfs = filenames.get_orfs(config['genome_base_path'],
                                          config['genome_name'],
                                          note=config.get('orf_note'),
                                          is_annotated=False,
                                          is_de_novo=True)

        de_novo_exons_file = filenames.get_exons(config['genome_base_path'],
                                                 config['genome_name'],
                                                 note=config.get('orf_note'),
                                                 is_annotated=False,
                                                 is_de_novo=True)

        orfs_files = [annotated_orfs, de_novo_orfs]

        orfs_files_str = ' '.join(orfs_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            orfs_genomic, orfs_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(orfs_files, sort_bed=True)
            concatenated_bed['orf_num'] = range(len(concatenated_bed))
            fields = bed_utils.bed12_field_names + [
                'orf_num', 'orf_len', 'orf_type'
            ]
            bed_utils.write_bed(concatenated_bed[fields], orfs_genomic)
        else:
            msg = "Skipping concatenation because the --do-not-call flag was given"
            logger.info(msg)

        exons_files = [annotated_exons_file, de_novo_exons_file]

        exons_files_str = ' '.join(exons_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            exons_file, exons_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(exons_files,
                                                     sort_bed=True)
            fields = bed_utils.bed6_field_names + [
                'exon_index', 'transcript_start'
            ]
            bed_utils.write_bed(concatenated_bed[fields], exons_file)
        else:
            msg = "Skipping concatenation because the --do-not-call flag was given"
            logger.info(msg)

    else:
        # finally, make sure our files are named correctly

        if os.path.exists(annotated_orfs):
            utils.create_symlink(annotated_orfs, orfs_genomic, call)

        if os.path.exists(annotated_exons_file):
            utils.create_symlink(annotated_exons_file, exons_file, call)
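
For reference, a minimal (hypothetical) config covering the required keys checked above, shown as the Python dict that yaml.safe_load would produce from the YAML file; all paths are placeholders:

example_config = {
    'genome_base_path': '/data/indices/GRCh38',
    'genome_name': 'GRCh38',
    'gtf': '/data/annotation/GRCh38.gtf',
    'fasta': '/data/genome/GRCh38.fa',
    'ribosomal_fasta': '/data/rrna/rrna.fa',
    'ribosomal_index': '/data/indices/rrna',
    'star_index': '/data/indices/star',
    # optional: include ORFs from a de novo assembly
    # 'de_novo_gtf': '/data/assembly/de-novo.gtf',
}
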
Example #5
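# Assumed imports and module-level names, following the rpbp sources; the
# exact module paths and default values depend on the installed version.
# get_all_bayes_factors_args and the default_* values are defined elsewhere
# in the original script.
import argparse
import ctypes
import logging
import multiprocessing
import pickle
import sys

import numpy as np
import pandas as pd
import scipy.io

import pbio.misc.logging_utils as logging_utils
import pbio.misc.parallel as parallel
import pbio.misc.slurm as slurm
import pbio.utils.bed_utils as bed_utils

logger = logging.getLogger(__name__)
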
def main():
    global profiles_data, profiles_indices, profiles_indptr, profiles_shape
    global translated_models, untranslated_models
    global args

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""
            This script uses Hamiltonian MCMC with Stan to estimate translation parameters
            for a set of regions (presumably ORFs). Roughly, it takes as input:

            (1) a set of regions (ORFs) and their corresponding profiles
            (2) a "translated" model which gives the probability that a region is translated
            (3) an "untranslated" model which gives the probability that a region is not translated

            The script first smooths the profiles using LOWESS. It then calculates
            both the Bayes' factor (using the smoothed profile) and the chi-square
            value (using the raw counts) for each ORF.
        """
        )

    parser.add_argument('profiles', help="The ORF profiles (counts) (mtx)")
    parser.add_argument('regions', help="The regions (ORFs) for which predictions will "
        "be made (BED12+)")
    
    parser.add_argument('out', help="The output file for the Bayes' factors (BED12+)")

    parser.add_argument('--chi-square-only', help="If this flag is present, then only the chi "
        "square test will be performed for each ORF. This can also be a way to get the counts "
        "within each of the ORFs.", action='store_true')
    
    parser.add_argument('--translated-models', help="The models to use as H_t (pkl)", nargs='+')
    parser.add_argument('--untranslated-models', help="The models to use as H_u (pkl)", nargs='+')

    ### filtering options
    parser.add_argument('--orf-types', help="If values are given, then only ORFs "
        "with those types are processed.", nargs='*', default=default_orf_types)
    parser.add_argument('--orf-type-field', default=default_orf_type_field)

    parser.add_argument('--min-length', help="ORFs with length less than this value will not "
        "be processed", type=int, default=default_min_length)
    parser.add_argument('--max-length', help="ORFs with length greater than this value will not "
        "be processed", type=int, default=default_max_length)
    parser.add_argument('--min-profile', help="ORFs with profile sum (i.e., number "
        "of reads) less than this value will not be processed.", type=float, 
        default=default_min_profile)

    ### smoothing options
    parser.add_argument('--fraction', help="The fraction of signal to use in LOWESS", 
        type=float, default=default_fraction)

    parser.add_argument('--reweighting-iterations', help="The number of reweighting "
        "iterations to use in LOWESS. Please see the statsmodels documentation for a "
        "detailed description of this parameter.", type=int, default=default_reweighting_iterations)

    ### MCMC options
    parser.add_argument('-s', '--seed', help="The random seed to use for inference",
        type=int, default=default_seed)
    parser.add_argument('-c', '--chains', help="The number of MCMC chains to use", type=int,
        default=default_chains)
    parser.add_argument('-i', '--iterations', help="The number of MCMC iterations to use for "
        "each chain", type=int, default=default_iterations)
    
    ### behavior options
    parser.add_argument('--num-orfs', help="If n>0, then only this many ORFs will be processed",
        type=int, default=default_num_orfs)
    parser.add_argument('--orf-num-field', default=default_orf_num_field)

    parser.add_argument('--do-not-compress', help="If this flag is given, the output "
        "will not be compressed; by default, it is written in GZip format",
        action='store_true')

    parser.add_argument('-g', '--num-groups', help="The number of groups into which to split "
        "the ORFs. More groups means the progress bar is updated more frequently but incurs "
        "more overhead because of the parallel calls.", type=int, default=default_num_groups)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # read in the regions and apply the filters
    msg = "Reading and filtering ORFs"
    logger.info(msg)
    regions = bed_utils.read_bed(args.regions)

    # by default, keep everything
    m_filters = np.array([True] * len(regions))

    if len(args.orf_types) > 0:
        m_orf_type = regions[args.orf_type_field].isin(args.orf_types)
        m_filters = m_orf_type & m_filters

    # min length
    if args.min_length > 0:
        m_min_length = regions['orf_len'] >= args.min_length
        m_filters = m_min_length & m_filters

    # max length
    if args.max_length > 0:
        m_max_length = regions['orf_len'] <= args.max_length
        m_filters = m_max_length & m_filters

    # min profile: the mtx rows are indexed by 'orf_num', so the row sums
    # give the per-ORF read counts
    profiles = scipy.io.mmread(args.profiles).tocsr()
    profiles_sums = profiles.sum(axis=1)
    good_orf_nums = np.where(profiles_sums >= args.min_profile)
    good_orf_nums = set(good_orf_nums[0])
    m_profile = regions['orf_num'].isin(good_orf_nums)
    m_filters = m_profile & m_filters

    regions = regions[m_filters]
    
    if args.num_orfs > 0:
        regions = regions.head(args.num_orfs)

    regions = regions.reset_index(drop=True)

    msg = "Number of regions after filtering: {}".format(len(regions))
    logger.info(msg)

    logger.debug("Reading models")
    translated_models = [pickle.load(open(tm, 'rb')) for tm in args.translated_models]
    untranslated_models = [pickle.load(open(bm, 'rb')) for bm in args.untranslated_models]
    
    # share the CSR components of the profile matrix with the worker
    # processes as flat buffers, so the matrix does not have to be
    # pickled for every task
    profiles_data = multiprocessing.RawArray(ctypes.c_double, profiles.data.flat)
    profiles_indices = multiprocessing.RawArray(ctypes.c_int, profiles.indices)
    profiles_indptr = multiprocessing.RawArray(ctypes.c_int, profiles.indptr)
    profiles_shape = multiprocessing.RawArray(ctypes.c_int, profiles.shape)

    bfs_l = parallel.apply_parallel_split(
        regions, 
        args.num_cpus,
        get_all_bayes_factors_args, 
        num_groups=args.num_groups,
        progress_bar=True
    )

    bfs = pd.concat(bfs_l)

    # write the results as a bed12+ file
    bed_utils.write_bed(bfs, args.out)
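
Two of the pieces above are worth seeing in isolation. First, the RawArray sharing pattern: a worker process can rebuild the CSR matrix from the shared buffers without any per-task copying. A hypothetical helper (the real workers live in get_all_bayes_factors_args):

import ctypes

import numpy as np
import scipy.sparse

def rebuild_profiles(data, indices, indptr, shape):
    # reinterpret the shared buffers as numpy arrays (no copy) and wrap
    # them in a csr_matrix
    return scipy.sparse.csr_matrix(
        (np.frombuffer(data, dtype=ctypes.c_double),
         np.frombuffer(indices, dtype=ctypes.c_int),
         np.frombuffer(indptr, dtype=ctypes.c_int)),
        shape=tuple(shape))

Second, the LOWESS smoothing mentioned in the description; frac and it correspond to --fraction and --reweighting-iterations. A toy sketch on a synthetic profile:

import numpy as np
import statsmodels.api as sm

x = np.arange(30, dtype=float)
y = np.random.poisson(5, size=30).astype(float)

smoothed = sm.nonparametric.lowess(y, x, frac=0.2, it=2)
# smoothed[:, 1] holds the smoothed counts, ordered by x
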