Code example #1
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script extracts the differential micropeptides from two "
        "conditions. Please see the documentation in redmine for more details.\n\n"
        "Please see the pyensembl (https://github.com/hammerlab/pyensembl) "
        "documentation for more information about the ensembl release and species."
    )

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name_a', help="The name of the first condition")
    parser.add_argument('name_b', help="The name of the second condition")
    parser.add_argument('out', help="The output (.csv.gz or .xlsx) file")

    parser.add_argument(
        '-a',
        '--append-sheet',
        help="If this flag is given, "
        "then a worksheet with the name '<name_a>,<name_b>' will be appended "
        "to the .xlsx file given by out (if it exists)",
        action='store_true')

    parser.add_argument(
        '-f',
        '--filter',
        help="If this flag is present, then "
        "the output will be filtered to include only the differential "
        "micropeptides with the highest KL-divergence and read coverage",
        action='store_true')

    parser.add_argument(
        '--read-filter-percent',
        help="If the the --filter flag "
        "is given, then only the top --read-filter-percent micropeptides will "
        "be considered for the final output. They still must meet the KL-"
        "divergence filtering criteria.",
        type=float,
        default=default_read_filter_percent)

    parser.add_argument(
        '--kl-filter-percent',
        help="If the the --filter flag "
        "is given, then only the top --read-kl-percent micropeptides will "
        "be considered for the final output. They still must meet the read "
        "coverage filtering criteria.",
        type=float,
        default=default_kl_filter_percent)

    parser.add_argument(
        '--id-matches',
        help="This is a list of files which "
        "contain ORF identifiers to compare to the differential micropeptides. "
        "For each of the files given, two columns will be added to the output "
        "which indicate if either A or B appear in the respective file. Each "
        "file should have a single ORF identifier on each line and contain "
        "nothing else.",
        nargs='*',
        default=default_id_matches)

    parser.add_argument(
        '--id-match-names',
        help="A name to include in the "
        "output file for each --id-matches file. The number of names must "
        "match the number of files.",
        nargs='*',
        default=default_id_match_names)

    parser.add_argument(
        '--overlaps',
        help="This is a list of bed12+ files "
        "which will be compared to the differential micropeptides. Two columns "
        "(one for A, one for B) will be added to the output which indicate if "
        "the respective micropeptides overlap a feature in each file by at "
        "least 1 bp.",
        nargs='*',
        default=default_overlaps)

    parser.add_argument(
        '--overlap-names',
        help="A name to include in the "
        "output file for each --overlaps file. The number of names must match "
        "the number of files.",
        nargs='*',
        default=default_overlap_names)

    parser.add_argument(
        '-r',
        '--ensembl-release',
        help="The version of Ensembl "
        "to use when mapping transcript identifiers to gene identifiers",
        type=int,
        default=default_ensembl_release)

    parser.add_argument(
        '-s',
        '--ensembl-species',
        help="The Ensembl species "
        "to use when mapping transcript identifiers to gene identifiers",
        default=default_ensembl_species)

    parser.add_argument(
        '--a-is-single-sample',
        help="By default, this script "
        "assumes the predictions come from merged replicates. If name_a is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.",
        action='store_true')

    parser.add_argument(
        '--b-is-single-sample',
        help="By default, this script "
        "assumes the predictions come from merged replicates. If name_b is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.",
        action='store_true')

    parser.add_argument('--fields-to-keep',
                        help="The fields to keep from the "
                        "Bayes factor file for each condition",
                        nargs='*',
                        default=default_fields_to_keep)

    parser.add_argument('--max-micropeptide-len',
                        help="The maximum (inclusive) "
                        "length of ORFs considered as micropeptides",
                        type=int,
                        default=default_max_micropeptide_len)

    parser.add_argument(
        '--do-not-fix-tcons',
        help="By default, the \"TCONS_\" "
        "identifiers from StringTie, etc., do not parse correctly; this script "
        "update the identifiers so that will parse correctly unless instructed not "
        "to. The script is likely to crash if the identifiers are not fixed.",
        action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ensembl database"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release,
                                       species=args.ensembl_species)
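    # accessing the db property eagerly loads the local Ensembl database for this
    # release/species, so missing annotation data is detected up front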
    ensembl.db

    msg = "Checking the id-match and overlaps files"
    logger.info(msg)

    if len(args.id_matches) != len(args.id_match_names):
        msg = ("The number of --id-matches files and --id-match-names do not "
               "match. {} files and {} names".format(len(args.id_matches),
                                                     len(args.id_match_names)))

        raise ValueError(msg)

    if len(args.overlaps) != len(args.overlap_names):
        msg = ("The number of --overlaps files and --overlaps-names do not "
               "match. {} files and {} names".format(len(args.overlaps),
                                                     len(args.overlap_names)))

        raise ValueError(msg)

    utils.check_files_exist(args.id_matches)
    utils.check_files_exist(args.overlaps)

    if args.filter:
        msg = "Validating filter percentages"
        logger.info(msg)

        math_utils.check_range(args.read_filter_percent,
                               0,
                               1,
                               variable_name="--read-filter-percent")

        math_utils.check_range(args.kl_filter_percent,
                               0,
                               1,
                               variable_name="--kl-filter-percent")

    msg = "Extracting file names"
    logger.info(msg)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    note_str = config.get('note', None)

    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    # and the smoothing parameters
    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations',
                                        None)

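    # for merged replicates, the filenames do not encode read lengths/offsets, so
    # these stay None; they are only looked up for single-sample conditions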
    lengths_a = None
    offsets_a = None

    if args.a_is_single_sample:
        lengths_a, offsets_a = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name_a, is_unique=is_unique)

    bayes_factors_a = filenames.get_riboseq_bayes_factors(
        config['riboseq_data'],
        args.name_a,
        length=lengths_a,
        offset=offsets_a,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors_a):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".
               format(args.name_a, bayes_factors_a))
        raise FileNotFoundError(msg)

    predicted_orfs_a = filenames.get_riboseq_predicted_orfs(
        config['riboseq_data'],
        args.name_a,
        length=lengths_a,
        offset=offsets_a,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations,
        is_filtered=True,
        is_chisq=False)

    if not os.path.exists(predicted_orfs_a):
        msg = (
            "Could not find the predictions bed file for {}. ({}). Quitting.".
            format(args.name_a, predicted_orfs_a))
        raise FileNotFoundError(msg)

    lengths_b = None
    offsets_b = None
    if args.b_is_single_sample:
        lengths_b, offsets_b = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name_b, is_unique=is_unique)

    bayes_factors_b = filenames.get_riboseq_bayes_factors(
        config['riboseq_data'],
        args.name_b,
        length=lengths_b,
        offset=offsets_b,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors_b):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".
               format(args.name_b, bayes_factors_b))
        raise FileNotFoundError(msg)

    predicted_orfs_b = filenames.get_riboseq_predicted_orfs(
        config['riboseq_data'],
        args.name_b,
        length=lengths_b,
        offset=offsets_b,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations,
        is_filtered=True,
        is_chisq=False)

    if not os.path.exists(predicted_orfs_b):
        msg = (
            "Could not find the predictions bed file for {}. ({}). Quitting.".
            format(args.name_b, predicted_orfs_b))
        raise FileNotFoundError(msg)

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    if not os.path.exists(exons_file):
        msg = "Could not find the exons file ({}). Quitting.".format(
            exons_file)
        raise FileNotFoundError(msg)

    msg = "Reading the exons"
    logger.info(msg)

    exons = bed_utils.read_bed(exons_file)

    msg = "Reading the BF files"
    logger.info(msg)

    bf_df_a = bed_utils.read_bed(bayes_factors_a)
    bf_df_b = bed_utils.read_bed(bayes_factors_b)

    msg = "Reading the predictions files"
    logger.info(msg)

    bed_df_a = bed_utils.read_bed(predicted_orfs_a)
    bed_df_b = bed_utils.read_bed(predicted_orfs_b)

    differential_micropeptide_dfs = []

    # extract micropeptides
    msg = "Extracting micropeptides"
    logger.info(msg)

    m_micropeptides_a = bed_df_a['orf_len'] <= args.max_micropeptide_len
    m_micropeptides_b = bed_df_b['orf_len'] <= args.max_micropeptide_len

    micropeptides_a = bed_df_a[m_micropeptides_a]
    micropeptides_b = bed_df_b[m_micropeptides_b]

    long_orfs_a = bed_df_a[~m_micropeptides_a]
    long_orfs_b = bed_df_b[~m_micropeptides_b]

    msg = "Finding micropeptides in A with no overlap in B"
    logger.info(msg)

    micropeptides_a_no_match_b = bed_utils.subtract_bed(micropeptides_a,
                                                        bed_df_b,
                                                        exons=exons)

    micropeptides_a_no_match_b_df = pd.DataFrame()
    micropeptides_a_no_match_b_df['A'] = list(micropeptides_a_no_match_b)
    micropeptides_a_no_match_b_df['B'] = None
    micropeptides_a_no_match_b_df['kl'] = np.inf
    micropeptides_a_no_match_b_df['overlap_type'] = 'micro_a_only'

    differential_micropeptide_dfs.append(micropeptides_a_no_match_b_df)

    msg = "Finding micropeptides in B with no overlap in A"
    logger.info(msg)

    micropeptides_b_no_match_a = bed_utils.subtract_bed(micropeptides_b,
                                                        bed_df_a,
                                                        exons=exons)

    micropeptides_b_no_match_a_df = pd.DataFrame()
    micropeptides_b_no_match_a_df['B'] = list(micropeptides_b_no_match_a)
    micropeptides_b_no_match_a_df['A'] = None
    micropeptides_b_no_match_a_df['kl'] = np.inf
    micropeptides_b_no_match_a_df['overlap_type'] = 'micro_b_only'

    differential_micropeptide_dfs.append(micropeptides_b_no_match_a_df)

    msg = "Finding overlapping micropeptides"
    logger.info(msg)

    micropeptides_a_micropeptides_b_df = get_overlap_df(
        micropeptides_a, micropeptides_b, 'micro_a_micro_b', bf_df_a, bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_micropeptides_b_df)

    micropeptides_a_long_b_df = get_overlap_df(micropeptides_a, long_orfs_b,
                                               'micro_a_long_b', bf_df_a,
                                               bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_long_b_df)

    micropeptides_b_long_a_df = get_overlap_df(long_orfs_a, micropeptides_b,
                                               'long_a_micro_b', bf_df_a,
                                               bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_b_long_a_df)

    differential_micropeptides_df = pd.concat(differential_micropeptide_dfs)

    msg = "Adding read count information"
    logger.info(msg)

    res = differential_micropeptides_df.merge(bf_df_a[args.fields_to_keep],
                                              left_on='A',
                                              right_on='id',
                                              how='left')
    to_rename = {f: "{}_A".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_A', axis=1)

    res = res.merge(bf_df_b[args.fields_to_keep],
                    left_on='B',
                    right_on='id',
                    how='left')
    to_rename = {f: "{}_B".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_B', axis=1)

    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    if not args.do_not_fix_tcons:
        # replace TCONS_ with TCONS
        res['A'] = res['A'].str.replace("TCONS_", "TCONS")
        res['B'] = res['B'].str.replace("TCONS_", "TCONS")

    msg = "Extracting the genes and their biotypes using pyensembl"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release,
                                       species=args.ensembl_species)
    ensembl_transcript_ids = set(ensembl.transcript_ids())

    biotypes_a = parallel.apply_df_simple(res, get_transcript_and_biotype, 'A',
                                          ensembl, ensembl_transcript_ids)
    biotypes_b = parallel.apply_df_simple(res, get_transcript_and_biotype, 'B',
                                          ensembl, ensembl_transcript_ids)

    biotypes_a = utils.remove_nones(biotypes_a)
    biotypes_b = utils.remove_nones(biotypes_b)

    biotypes_a = pd.DataFrame(biotypes_a)
    biotypes_b = pd.DataFrame(biotypes_b)

    res = res.merge(biotypes_a, on='A', how='left')
    res = res.merge(biotypes_b, on='B', how='left')

    msg = "Pulling annotations from mygene.info"
    logger.info(msg)

    # pull annotations from mygene
    gene_info_a = mygene_utils.query_mygene(res['gene_id_A'])
    gene_info_b = mygene_utils.query_mygene(res['gene_id_B'])

    # and add the mygene info
    res = res.merge(gene_info_a,
                    left_on='gene_id_A',
                    right_on='gene_id',
                    how='left')

    to_rename = {f: "{}_A".format(f) for f in gene_info_a.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)

    res = res.merge(gene_info_b,
                    left_on='gene_id_B',
                    right_on='gene_id',
                    how='left')

    to_rename = {f: "{}_B".format(f) for f in gene_info_a.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)

    msg = "Removing duplicates"
    logger.info(msg)
    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    msg = "Adding --id-matches columns"
    logger.info(msg)

    for (id_match_file, name) in zip(args.id_matches, args.id_match_names):
        res = add_id_matches(res, id_match_file, name)

    msg = "Adding --overlaps columns"
    logger.info(msg)

    for (overlap_file, name) in zip(args.overlaps, args.overlap_names):
        res = add_overlaps(res, overlap_file, name, bed_df_a, bed_df_b, exons)

    msg = "Sorting by in-frame reads"
    logger.info(msg)

    res['x_1_sum_A'] = res['x_1_sum_A'].fillna(0)
    res['x_1_sum_B'] = res['x_1_sum_B'].fillna(0)
    res['x_1_sum'] = res['x_1_sum_A'] + res['x_1_sum_B']
    res = res.sort_values('x_1_sum', ascending=False)

    if args.filter:
        msg = "Filtering the micropeptides by read coverage and KL-divergence"
        logger.info(msg)

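        # rank by total in-frame reads (rank 1 = most reads) and keep only the
        # top --read-filter-percent fraction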
        x_1_sum_ranks = res['x_1_sum'].rank(method='min',
                                            na_option='top',
                                            ascending=False)
        num_x_1_sum_ranks = x_1_sum_ranks.max()
        max_good_x_1_sum_rank = num_x_1_sum_ranks * args.read_filter_percent
        m_good_x_1_sum_rank = x_1_sum_ranks <= max_good_x_1_sum_rank

        msg = ("Number of micropeptides passing read filter: {}".format(
            sum(m_good_x_1_sum_rank)))
        logger.debug(msg)

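        # likewise, rank by KL-divergence and keep only the top
        # --kl-filter-percent fraction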
        kl_ranks = res['kl'].rank(method='dense',
                                  na_option='top',
                                  ascending=False)
        num_kl_ranks = kl_ranks.max()
        max_good_kl_rank = num_kl_ranks * args.kl_filter_percent
        m_good_kl_rank = kl_ranks <= max_good_kl_rank

        msg = ("Number of micropeptides passing KL filter: {}".format(
            sum(m_good_kl_rank)))
        logger.debug(msg)

        m_both_filters = m_good_x_1_sum_rank & m_good_kl_rank

        msg = ("Number of micropeptides passing both filters: {}".format(
            sum(m_both_filters)))
        logger.debug(msg)

        res = res[m_both_filters]

    msg = "Writing differential micropeptides to disk"
    logger.info(msg)

    if not args.append_sheet:
        pandas_utils.write_df(res, args.out, index=False)
    else:
        sheet_name = "{},{}".format(args.name_a, args.name_b)
        utils.append_to_xlsx(res, args.out, sheet=sheet_name, index=False)
Code example #2
def get_orfs(gtf, args, config, is_annotated=False, is_de_novo=False):
    """ Process a GTF file into its ORFs.
    """

    call = not args.do_not_call
    chr_name_file = os.path.join(config['star_index'], 'chrName.txt')
    chr_name_str = "--chr-name-file {}".format(chr_name_file)

    logging_str = logging_utils.get_logging_options_string(args)
    cpus_str = "--num-cpus {}".format(args.num_cpus)

    # extract a BED12 of the annotated ORFs
    transcript_bed = filenames.get_bed(config['genome_base_path'],
                                       config['genome_name'],
                                       is_merged=False,
                                       is_annotated=is_annotated,
                                       is_de_novo=is_de_novo)

    cmd = ("gtf-to-bed12 {} {} {} {} {}".format(gtf, transcript_bed,
                                                chr_name_str, cpus_str,
                                                logging_str))
    in_files = [gtf]
    out_files = [transcript_bed]
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)

    # extract the transcript fasta
    transcript_fasta = filenames.get_transcript_fasta(
        config['genome_base_path'],
        config['genome_name'],
        is_annotated=is_annotated,
        is_de_novo=is_de_novo)

    cmd = ("extract-bed-sequences {} {} {} {}".format(transcript_bed,
                                                      config['fasta'],
                                                      transcript_fasta,
                                                      logging_str))
    in_files = [transcript_bed, config['fasta']]
    out_files = [transcript_fasta]
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)

    # extract ORFs from the transcripts using genomic coordinates
    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'),
                                      is_annotated=is_annotated,
                                      is_de_novo=is_de_novo)

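    # the start and stop codons to consider can be overridden in the config;
    # otherwise the defaults are used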
    start_codons_str = utils.get_config_argument(config,
                                                 'start_codons',
                                                 default=default_start_codons)

    stop_codons_str = utils.get_config_argument(config,
                                                'stop_codons',
                                                default=default_stop_codons)

    cmd = "extract-orf-coordinates {} {} {} {} {} {} {}".format(
        transcript_bed, transcript_fasta, orfs_genomic, cpus_str,
        start_codons_str, stop_codons_str, logging_str)
    in_files = [transcript_fasta, transcript_bed]
    out_files = [orfs_genomic]
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)

    # write the ORF exons, used to label the ORFs
    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'),
                                     is_annotated=is_annotated,
                                     is_de_novo=is_de_novo)

    cmd = ("split-bed12-blocks {} {} --num-cpus {} {}".format(
        orfs_genomic, exons_file, args.num_cpus, logging_str))
    in_files = [orfs_genomic]
    out_files = [exons_file]
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)

    # label the ORFs
    labeled_orfs = filenames.get_labels(config['genome_base_path'],
                                        config['genome_name'],
                                        note=config.get('orf_note'),
                                        is_annotated=is_annotated,
                                        is_de_novo=is_de_novo)

    annotated_bed = filenames.get_bed(config['genome_base_path'],
                                      config['genome_name'],
                                      is_merged=False,
                                      is_annotated=True)

    orf_exons_str = '--orf-exons {}'.format(exons_file)

    de_novo_str = ""
    if is_de_novo:
        de_novo_str = '--label-prefix "novel_" --filter --nonoverlapping-label "novel"'

    cmd = "label-orfs {} {} {} {} {} {} {}".format(annotated_bed, orfs_genomic,
                                                   labeled_orfs, orf_exons_str,
                                                   de_novo_str, logging_str,
                                                   cpus_str)
    in_files = [annotated_bed, orfs_genomic, exons_file]
    # NB: this call overwrites the input file `orfs_genomic`
    out_files = [labeled_orfs]
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)
Code example #3
def main():
    
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description="""This script runs all of the processing necessary to 
        produce the signals used for ORF translation prediction. In particular, it creates the 
        metagene profiles, selects the periodic fragments and generates the ORF profiles.""")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")

    parser.add_argument('config', help="The (yaml) configuration file")

    parser.add_argument('name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
                        type=int, default=default_num_cpus)

    parser.add_argument('--mem', help="The amount of RAM to request", default=default_mem)

    parser.add_argument('--tmp', help="The location for temp files", default=None)

    parser.add_argument('--do-not-call', action='store_true')

    parser.add_argument('--overwrite', help="""If this flag is present, existing files 
        will be overwritten.""", action='store_true')
         
    parser.add_argument('-k', '--keep-intermediate-files', help="""If this flag is given,
        then all intermediate files will be kept; otherwise, they will be deleted. 
        This feature is implemented piecemeal. If the --do-not-call flag is given, 
        then nothing will be deleted.""", action='store_true')

    logging_utils.add_logging_options(parser)
    pgrm_utils.add_star_options(parser, star_executable)
    pgrm_utils.add_flexbar_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[create-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'create-base-genome-profile',
        'remove-multimapping-reads',
        'extract-metagene-profiles',
        'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets',
        'extract-orf-profiles'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'ribosomal_index',
        'gtf',
        'genome_base_path',
        'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)
    models_base = config.get('models_base', default_models_base)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = pgrm_utils.get_star_options_string(args)
    flexbar_str = pgrm_utils.get_flexbar_options_string(args)

    # handle do_not_call so that we do call the preprocessing script,
    # but that it does not run anything
    call = not args.do_not_call
    do_not_call_argument = ""
    if not call:
        do_not_call_argument = "--do-not-call"

    overwrite_argument = ""
    if args.overwrite:
        overwrite_argument = "--overwrite"

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    # check if we want to keep multimappers
    is_unique = not ('keep_riboseq_multimappers' in config)

    riboseq_raw_data = args.raw_data
    riboseq_bam_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                     args.name,
                                                     is_unique=is_unique,
                                                     note=note)

    cmd = ("create-base-genome-profile {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}".format(
        riboseq_raw_data,
        args.config,
        args.name,
        args.num_cpus,
        do_not_call_argument,
        overwrite_argument,
        logging_str,
        star_str,
        tmp_str,
        flexbar_str,
        keep_intermediate_str,
        mem_str))

    # There could be cases where we start somewhere in the middle of creating
    # the base genome profile. So even if the "raw data" is not available, 
    # we still want to call the base pipeline.
    # in_files = [riboseq_raw_data]
    in_files = []
    out_files = [riboseq_bam_filename]
    # we always call this, and pass --do-not-call through
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=True)

    # Extract the metagene profiles

    start_upstream_str = utils.get_config_argument(config,
                                                   'metagene_start_upstream',
                                                   'start-upstream',
                                                   default=metagene_options['metagene_start_upstream'])
    start_downstream_str = utils.get_config_argument(config,
                                                     'metagene_start_downstream',
                                                     'start-downstream',
                                                     default=metagene_options['metagene_start_downstream'])
    end_upstream_str = utils.get_config_argument(config,
                                                 'metagene_end_upstream',
                                                 'end-upstream',
                                                 default=metagene_options['metagene_end_upstream'])
    end_downstream_str = utils.get_config_argument(config,
                                                   'metagene_end_downstream',
                                                   'end-downstream',
                                                   default=metagene_options['metagene_end_downstream'])

    metagene_profiles = filenames.get_metagene_profiles(config['riboseq_data'],
                                                        args.name,
                                                        is_unique=is_unique,
                                                        note=note)

    # use the canonical transcripts for extracting the metagene profiles
    transcript_bed = filenames.get_bed(config['genome_base_path'],
                                       config['genome_name'],
                                       is_merged=False,
                                       is_annotated=True)

    cmd = ("extract-metagene-profiles {} {} {} --num-cpus {} {} {} {} {} {}".format(
        riboseq_bam_filename,
        transcript_bed,
        metagene_profiles,
        args.num_cpus,
        logging_str,
        start_upstream_str,
        start_downstream_str,
        end_upstream_str,
        end_downstream_str))

    in_files = [riboseq_bam_filename, transcript_bed]
    out_files = [metagene_profiles]
    file_checkers = {
        metagene_profiles: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # estimate the periodicity for each offset for all read lengths
    metagene_profile_bayes_factors = filenames.get_metagene_profiles_bayes_factors(
        config['riboseq_data'],
        args.name,
        is_unique=is_unique,
        note=note)

    periodic_models = filenames.get_models(models_base, 'periodic')
    non_periodic_models = filenames.get_models(models_base, 'nonperiodic')
    
    periodic_models_str = ' '.join(periodic_models)
    non_periodic_models_str = ' '.join(non_periodic_models)

    periodic_models_str = "--periodic-models {}".format(periodic_models_str)
    non_periodic_models_str = "--nonperiodic-models {}".format(non_periodic_models_str)

    periodic_offset_start_str = utils.get_config_argument(config,
                                                          'periodic_offset_start',
                                                          default=metagene_options['periodic_offset_start'])
    periodic_offset_end_str = utils.get_config_argument(config,
                                                        'periodic_offset_end',
                                                        default=metagene_options['periodic_offset_end'])
    metagene_profile_length_str = utils.get_config_argument(config,
                                                            'metagene_profile_length',
                                                            default=metagene_options['metagene_profile_length'])
    seed_str = utils.get_config_argument(config,
                                         'seed',
                                         default=metagene_options['seed'])
    chains_str = utils.get_config_argument(config,
                                           'chains',
                                           default=metagene_options['chains'])
    iterations_str = utils.get_config_argument(config,
                                               'metagene_iterations',
                                               'iterations',
                                               default=metagene_options['metagene_iterations'])

    cmd = ("estimate-metagene-profile-bayes-factors {} {} --num-cpus {} {} {} "
           "{} {} {} {} {} {} {}".format(metagene_profiles,
                                         metagene_profile_bayes_factors,
                                         args.num_cpus,
                                         periodic_models_str,
                                         non_periodic_models_str,
                                         periodic_offset_start_str,
                                         periodic_offset_end_str,
                                         metagene_profile_length_str,
                                         seed_str,
                                         chains_str,
                                         iterations_str,
                                         logging_str))

    in_files = [metagene_profiles]
    in_files.extend(periodic_models)
    in_files.extend(non_periodic_models)
    out_files = [metagene_profile_bayes_factors]
    file_checkers = {
        metagene_profile_bayes_factors: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)
    
    # select the best read lengths for constructing the signal
    periodic_offsets = filenames.get_periodic_offsets(config['riboseq_data'],
                                                      args.name,
                                                      is_unique=is_unique,
                                                      note=note)

    cmd = "select-periodic-offsets {} {}".format(metagene_profile_bayes_factors,
                                                 periodic_offsets)

    in_files = [metagene_profile_bayes_factors]
    out_files = [periodic_offsets]
    file_checkers = {
        periodic_offsets: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # get the lengths and offsets which meet the required criteria from the config file
    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(config,
                                                                   args.name,
                                                                   args.do_not_call,
                                                                   is_unique=is_unique,
                                                                   default_params=metagene_options)

    if len(lengths) == 0:
        msg = ("No periodic read lengths and offsets were found. Try relaxing "
               "min_metagene_profile_count, min_metagene_bf_mean, max_metagene_bf_var, "
               "and/or min_metagene_bf_likelihood. Quitting.")
        logger.critical(msg)
        return

    lengths_str = ' '.join(lengths)
    offsets_str = ' '.join(offsets)

    seqname_prefix_str = utils.get_config_argument(config, 'seqname_prefix')
    
    # extract the riboseq profiles for each orf
    unique_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                args.name,
                                                is_unique=is_unique,
                                                note=note)

    profiles_filename = filenames.get_riboseq_profiles(config['riboseq_data'],
                                                       args.name,
                                                       length=lengths,
                                                       offset=offsets,
                                                       is_unique=is_unique,
                                                       note=note)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    cmd = ("extract-orf-profiles {} {} {} {} --lengths {} --offsets {} {} {} --num-cpus {} ".format(
        unique_filename,
        orfs_genomic,
        exons_file,
        profiles_filename,
        lengths_str,
        offsets_str,
        logging_str,
        seqname_prefix_str,
        args.num_cpus))

    in_files = [orfs_genomic, exons_file, unique_filename]
    out_files = [profiles_filename]

    # todo: implement a file checker for mtx files
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)
Code example #4
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='''Prepare a reference genome and matching 
        annotations, including labelled ORFs, for use with the Rp-Bp periodicity estimation 
        and ORF translation prediction pipeline.''')

    parser.add_argument('config', help='''The (yaml) configuration file''')

    parser.add_argument('--overwrite',
                        help='''If this flag is present, existing files
        will be overwritten.''',
                        action='store_true')

    slurm.add_sbatch_options(parser,
                             num_cpus=default_num_cpus,
                             mem=default_mem)
    logging_utils.add_logging_options(parser)
    pgrm_utils.add_star_options(parser, star_executable)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # check required callable programs, config keys and files
    programs = [
        'extract-orf-coordinates', 'label-orfs', 'bowtie2-build-s',
        'split-bed12-blocks', 'gtf-to-bed12', args.star_executable
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'genome_base_path', 'genome_name', 'gtf', 'fasta', 'ribosomal_fasta',
        'ribosomal_index', 'star_index'
    ]
    utils.check_keys_exist(config, required_keys)

    files = [config['gtf'], config['fasta'], config['ribosomal_fasta']]
    if 'de_novo_gtf' in config:
        files += [config['de_novo_gtf']]
    utils.check_files_exist(files, source='prepare-rpbp-genome')

    # check if we want to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    call = not args.do_not_call

    # the rRNA index
    cmd = "bowtie2-build-s {} {}".format(config['ribosomal_fasta'],
                                         config['ribosomal_index'])

    in_files = [config['ribosomal_fasta']]
    out_files = pgrm_utils.get_bowtie2_index_files(config['ribosomal_index'])
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)

    # the STAR index
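    # --limitGenomeGenerateRAM expects bytes, so convert the human-readable value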
    mem = utils.human2bytes(args.mem)
    cmd = ("{} --runMode genomeGenerate --genomeDir {} --genomeFastaFiles {} "
           "--runThreadN {} --limitGenomeGenerateRAM {}".format(
               args.star_executable, config['star_index'], config['fasta'],
               args.num_cpus, mem))

    in_files = [config['fasta']]
    out_files = pgrm_utils.get_star_index_files(config['star_index'])
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)

    # get the ORFs
    get_orfs(config['gtf'], args, config, is_annotated=True, is_de_novo=False)

    # we will use these files later in the pipeline
    annotated_orfs = filenames.get_orfs(config['genome_base_path'],
                                        config['genome_name'],
                                        note=config.get('orf_note'),
                                        is_annotated=True,
                                        is_de_novo=False)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    annotated_exons_file = filenames.get_exons(config['genome_base_path'],
                                               config['genome_name'],
                                               note=config.get('orf_note'),
                                               is_annotated=True,
                                               is_de_novo=False)

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    annotated_labeled_orfs = filenames.get_labels(config['genome_base_path'],
                                                  config['genome_name'],
                                                  note=config.get('orf_note'),
                                                  is_annotated=True,
                                                  is_de_novo=False)

    labeled_orfs = filenames.get_labels(config['genome_base_path'],
                                        config['genome_name'],
                                        note=config.get('orf_note'))

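    # infer whether the reference annotations follow the GFF3 specification from
    # the file extension; this determines the annotation file prepared for STAR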
    use_gff3_specs = config['gtf'].endswith('gff')
    gtf_file = filenames.get_gtf(config['genome_base_path'],
                                 config['genome_name'],
                                 is_gff3=use_gff3_specs,
                                 is_star_input=True)

    # now, check if we have a de novo assembly
    if 'de_novo_gtf' in config:
        get_orfs(config['de_novo_gtf'],
                 args,
                 config,
                 is_annotated=False,
                 is_de_novo=True)

        # we need to concat the ORF and exon files
        de_novo_orfs = filenames.get_orfs(config['genome_base_path'],
                                          config['genome_name'],
                                          note=config.get('orf_note'),
                                          is_annotated=False,
                                          is_de_novo=True)

        orfs_files = [annotated_orfs, de_novo_orfs]

        orfs_files_str = ' '.join(orfs_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            orfs_genomic, orfs_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(orfs_files, sort_bed=True)
            concatenated_bed['orf_num'] = range(len(concatenated_bed))
            additional_columns = ['orf_num', 'orf_len', 'orf_type']
            fields = bed_utils.bed12_field_names + additional_columns
            bed_utils.write_bed(concatenated_bed[fields], orfs_genomic)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

        de_novo_exons_file = filenames.get_exons(config['genome_base_path'],
                                                 config['genome_name'],
                                                 note=config.get('orf_note'),
                                                 is_annotated=False,
                                                 is_de_novo=True)

        exons_files = [annotated_exons_file, de_novo_exons_file]

        exons_files_str = ' '.join(exons_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            exons_file, exons_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(exons_files,
                                                     sort_bed=True)
            fields = bed_utils.bed6_field_names + [
                'exon_index', 'transcript_start'
            ]
            bed_utils.write_bed(concatenated_bed[fields], exons_file)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

        de_novo_labeled_orfs = filenames.get_labels(
            config['genome_base_path'],
            config['genome_name'],
            note=config.get('orf_note'),
            is_annotated=False,
            is_de_novo=True)

        label_files = [annotated_labeled_orfs, de_novo_labeled_orfs]

        label_files_str = ' '.join(label_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            labeled_orfs, label_files_str))
        logger.info(msg)

        if call:
            # not sorted, as is
            concatenated_bed = bed_utils.concatenate(label_files,
                                                     sort_bed=False)
            bed_utils.write_bed(concatenated_bed, labeled_orfs)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

        # we also need to concat the annotations to inform STAR
        # there is no particular reason to merge and sort the files, so
        # we just concatenate them...
        if (config['de_novo_gtf'].endswith('gff') == use_gff3_specs):
            cmd = ("awk '!/^#/' {} {} > {}".format(config['gtf'],
                                                   config['de_novo_gtf'],
                                                   gtf_file))
            in_files = [config['gtf'], config['de_novo_gtf']]
            out_files = [gtf_file]
            shell_utils.call_if_not_exists(cmd,
                                           out_files,
                                           in_files=in_files,
                                           overwrite=args.overwrite,
                                           call=call)
        else:
            msg = (
                "Skipping concatenation due to mismatch in format specifications (GTF2/GFF3)"
                "for reference and do novo annotations. Symlink to reference annotations created."
            )
            logger.warning(msg)
            if os.path.exists(config['gtf']):
                shell_utils.create_symlink(config['gtf'], gtf_file, call)

    else:
        # if we do not have a de novo assembly, symlink the files

        if os.path.exists(annotated_orfs):
            shell_utils.create_symlink(annotated_orfs, orfs_genomic, call)

        if os.path.exists(annotated_exons_file):
            shell_utils.create_symlink(annotated_exons_file, exons_file, call)

        if os.path.exists(annotated_labeled_orfs):
            shell_utils.create_symlink(annotated_labeled_orfs, labeled_orfs,
                                       call)

        if os.path.exists(config['gtf']):
            shell_utils.create_symlink(config['gtf'], gtf_file, call)
Code example #5
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Extract the ORF profiles for each specified read length "
        "and offset independently, creating one sparse matrix file (mtx) for "
        "each read length. These are then collected into a 'sparse tensor'.")

    parser.add_argument('config', help="The yaml config file.")
    parser.add_argument('name', help="The name of either one of the 'riboseq_samples' "
        "or 'riboseq_biological_replicates' from the config file.")
    
    parser.add_argument('out', help="The output (txt.gz) file. N.B. The output uses "
        "base-0 indexing, contrary to the unsmoothed ORF profiles, which are written "
        "using the matrix market format (base-1 indexing).")

    parser.add_argument('-c', '--is-condition', help="If this flag is present, "
        "then 'name' will be taken to be a condition name. The profiles for "
        "all relevant replicates of the condition will be created.", 
        action='store_true')

    parser.add_argument('--add-ids', help="If this flag is present, "
        "then orf_ids will be added to the final output.", action='store_true')

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    cpus_str = "--num-cpus {}".format(args.num_cpus)

    msg = "[create-read-length-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    msg = "Reading config file"
    logger.info(msg)
    config = yaml.load(open(args.config), Loader=yaml.FullLoader)
 
    # pull out what we need from the config file
    is_unique = not ('keep_riboseq_multimappers' in config)    
    seqname_str = utils.get_config_argument(config, 'seqname_prefix')
    note = config.get('note', None)
    orf_note = config.get('orf_note', None)
    
    orfs = filenames.get_orfs(
        config['genome_base_path'], 
        config['genome_name'], 
        note=orf_note
    )

    exons = filenames.get_exons(
        config['genome_base_path'], 
        config['genome_name'],
        note=orf_note,
        is_orf=True
    )
    
    # make sure the necessary files exist
    required_files = [orfs, exons]
    msg = "[create-read-length-orf-profiles]: Some input files were missing: "
    utils.check_files_exist(required_files, msg=msg)

    # process one sample or all samples from condition
    names = [args.name]
    is_condition_str = ""
    if args.is_condition:
        is_condition_str = "--is-condition"
        riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
        names = [n for n in riboseq_replicates[args.name]]

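    # submit one extract-orf-profiles job per sample and periodic (length, offset)
    # pair; the final collection step depends on all of these jobs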
    job_ids = []
    for name in names:

        msg = "Processing sample: {}".format(name)
        logger.info(msg)
        
        # now the relevant files
        bam = filenames.get_riboseq_bam(
            config['riboseq_data'], 
            name, 
            is_unique=is_unique, 
            note=note
        )

        # make sure the necessary files exist
        required_files = [bam]
        msg = "[create-read-length-orf-profiles]: Some input files were missing: "
        utils.check_files_exist(required_files, msg=msg)

        # get the lengths and offsets which meet the required criteria from the config file
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config, 
            name, 
            is_unique=is_unique
        )

        if len(lengths) == 0:
            msg = ("No periodic read lengths and offsets were found. Try relaxing "
                "min_metagene_profile_count, min_metagene_bf_mean, "
                "max_metagene_bf_var, and/or min_metagene_bf_likelihood. Qutting.")
            logger.critical(msg)
            return

        for length, offset in zip(lengths, offsets):
            lengths_str = "--lengths {}".format(length)
            offsets_str = "--offsets {}".format(offset)

            mtx = filenames.get_riboseq_profiles(
                config['riboseq_data'], 
                name, 
                length=[length], 
                offset=[offset],
                is_unique=is_unique, 
                note=note
            )

            cmd = "extract-orf-profiles {} {} {} {} {} {} {} {}".format(
                bam,
                orfs,
                exons,
                mtx,
                lengths_str,
                offsets_str,
                seqname_str,
                cpus_str,
                logging_str
            )
            
            job_id = slurm.check_sbatch(cmd, args=args)

            job_ids.append(job_id)

    add_ids_str = ""
    if args.add_ids:
        add_ids_str = "--add-ids"

    cmd = "collect-read-length-orf-profiles {} {} {} {} {}".format(
        args.config,
        args.name,
        args.out,
        is_condition_str,
        add_ids_str,
        logging_str
    )

    slurm.check_sbatch(cmd, args=args, dependencies=job_ids)