def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script adds the ORF profiles from a set of profiles (presumably, "
        "each file corresponds to one replicate from a condition). The script keeps the "
        "profiles in sparse matrix format, so it is fairly efficient.")

    parser.add_argument('profiles', help="The (mtx) files containing the "
        "ORF profiles", nargs='+')
    parser.add_argument('out', help="The (mtx.gz) output file containing the merged profiles")
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading first ORF profile"
    logger.info(msg)

    merged_profiles = scipy.io.mmread(args.profiles[0]).tocsr()

    msg = "Adding each additional profile"
    logger.info(msg)

    for profile_file in args.profiles[1:]:
        msg = "Reading file: {}".format(profile_file)
        logger.info(msg)

        profiles = scipy.io.mmread(profile_file).tocsr()
        merged_profiles = merged_profiles + profiles

    msg = "Writing merged profiles to disk"
    logger.info(msg)

    math_utils.write_sparse_matrix(args.out, merged_profiles)
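
The math_utils.write_sparse_matrix helper is not shown in this listing. A minimal sketch of what it might do, assuming it simply wraps scipy.io.mmwrite and gzip-compresses the output when the target filename ends in ".gz"; both the behavior and the parameter names are assumptions, not the project's actual implementation.

import gzip

import scipy.io


def write_sparse_matrix(target, matrix):
    # write the matrix in MatrixMarket format, compressing the output
    # when the target filename indicates a gzipped file (assumed behavior)
    if target.endswith('.gz'):
        with gzip.open(target, 'wb') as out:
            scipy.io.mmwrite(out, matrix)
    else:
        scipy.io.mmwrite(target, matrix)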
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script collects counts of riboseq reads filtered at each step in "
        "the micropeptide prediction pipeline. It mostly parses fastqc results (using the "
        "crimson python package).")
    parser.add_argument('config', help="The yaml config file")
    parser.add_argument('out', help="The output csv file with the counts")
    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of processors to use",
                        type=int,
                        default=default_num_cpus)
    parser.add_argument('--overwrite', action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    programs = ['samtools']
    shell_utils.check_programs_exist(programs)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    res = parallel.apply_parallel_iter(config['riboseq_samples'].items(),
                                       args.num_cpus, get_counts, config, args)
    res = [r for r in res if r is not None]
    res_df = pd.DataFrame(res)

    utils.write_df(res_df, args.out, index=False)
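
parallel.apply_parallel_iter appears throughout these examples. A rough sketch of the idiom, assuming it is a thin wrapper around joblib that maps a function over an iterable with the extra positional arguments appended; progress-bar and grouping support are omitted, and the real project implementation may differ.

import joblib


def apply_parallel_iter(items, num_cpus, func, *args, progress_bar=False, num_groups=None):
    # apply func(item, *args) to each element of items using num_cpus processes;
    # progress_bar and num_groups are accepted but ignored in this sketch
    return joblib.Parallel(n_jobs=num_cpus)(
        joblib.delayed(func)(item, *args) for item in items
    )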
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script collects counts of riboseq reads filtered at each step in "
        "the micropeptide prediction pipeline. It mostly parses fastqc results (using the "
        "crimson python package).")
    parser.add_argument('config', help="The yaml config file")
    parser.add_argument('out', help="The output csv file with the counts")
    parser.add_argument('-p', '--num-cpus', help="The number of processors to use", 
        type=int, default=default_num_cpus)
    parser.add_argument('--overwrite', action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    programs = ['samtools']
    shell_utils.check_programs_exist(programs)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    res = parallel.apply_parallel_iter(config['riboseq_samples'].items(), 
                                        args.num_cpus, 
                                        get_counts, config, args)
    res = [r for r in res if r is not None]
    res_df = pd.DataFrame(res)

    pandas_utils.write_df(res_df, args.out, index=False)
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script clusters the ORFs based on their subcodon "
        "counts using a DP-GMM; the means and weights of the clusters are "
        "written to a pickle file which consists of a list. The first element "
        "of the list are the means, and the second is the weights.")

    parser.add_argument('bf', help="The bayes factor file containing counts")
    parser.add_argument('out', help="The output (pickle) file")

    parser.add_argument('--fraction', help="The top <fraction> genes, based "
        "on normalized read counts, will be used for clustering", type=float,
        default=default_fraction)

    parser.add_argument('--max-iter', help="The maximum number of iterations "
        "for clustering", type=int, default=default_max_iter)
    parser.add_argument('--n-components', help="The maximum number of "
        "clusters", type=int, default=default_n_components)
    parser.add_argument('--seed', help="The seed for the random number "
        "generator", type=int, default=default_seed)
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading BF file"
    logger.info(msg)
    bf = bed_utils.read_bed(args.bf)

    msg = "Extracting top k% of ORFs"
    logger.info(msg)
    
    # calculate the normalized read coverage
    total_read_coverage = bf["x_1_sum"] + bf["x_2_sum"] + bf["x_3_sum"]
    rpk = total_read_coverage / bf['orf_len']
    sorted_rpk_indices = np.argsort(rpk)

    # and get the best args.fraction of them
    num_orfs = int(len(rpk) * args.fraction)
    top_k_orfs = sorted_rpk_indices.tail(num_orfs)

    msg = "Finding subcodon clusters"
    logger.info(msg)

    x_i_fields = ["x_1_sum", "x_2_sum", "x_3_sum"]
    X = bf.iloc[top_k_orfs.values][x_i_fields]

    model = np_utils.fit_bayesian_gaussian_mixture(X, 
        max_iter=args.max_iter, n_components=args.n_components, seed=args.seed)

    msg = "Writing means and weights to disk"
    logger.info(msg)

    to_pkl = [model.means_, model.weights_]
    pickle.dump(to_pkl, open(args.out, 'wb'))
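
np_utils.fit_bayesian_gaussian_mixture is not included here. A hedged sketch, assuming it wraps sklearn's BayesianGaussianMixture with a Dirichlet-process prior (which would match the "DP-GMM" wording in the description and the use of model.means_ and model.weights_ above); the exact parameter mapping and defaults are assumptions.

from sklearn.mixture import BayesianGaussianMixture


def fit_bayesian_gaussian_mixture(X, max_iter=100, n_components=100, seed=None):
    # fit a Dirichlet-process Bayesian GMM; the fitted model exposes the
    # means_ and weights_ attributes used by the calling script
    model = BayesianGaussianMixture(
        n_components=n_components,
        weight_concentration_prior_type='dirichlet_process',
        max_iter=max_iter,
        random_state=seed)
    model.fit(X)
    return model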
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Summarize the evaluation metrics for all scenarios")
    
    clu.add_config(parser)
    parser.add_argument('out')

    clu.add_cv_options(parser)
    clu.add_num_cpus(parser)

    automl_utils.add_blas_options(parser)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # see which folds to run
    if len(args.folds) == 0:
        args.folds = list(range(1,11))
    clu.validate_folds_options(args)

    required_keys = ['base_path', 'training_scenarios_path']
    config = as_asl_utils.load_config(args.config, required_keys)

    if automl_utils.spawn_for_blas(args):
        return

    scenarios = utils.list_subdirs(config['training_scenarios_path'])
    use_random_forests = [False] #, True]
    it = itertools.product(scenarios, use_random_forests)

    all_stats = parallel.apply_parallel_iter(
        it,
        args.num_cpus,
        get_stats_summary,
        args,
        config
    )

    msg = "Combining statistics"
    logger.info(msg)

    all_stats_df = pd.DataFrame(all_stats)
    pd_utils.write_df(
        all_stats_df,
        args.out,
        create_path=True,
        do_not_compress=True,
        index=False
    )
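
utils.list_subdirs is a small helper used to enumerate the training scenarios. A minimal sketch of what it presumably does; whether it returns names or full paths is an assumption.

import os


def list_subdirs(path):
    # return the full paths of the immediate subdirectories of path
    return [
        os.path.join(path, d)
        for d in sorted(os.listdir(path))
        if os.path.isdir(os.path.join(path, d))
    ]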
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script converts a fasta file to a fastq file with dummy "
        "quality scores. It is adapted from https://www.biostars.org/p/99886/")

    parser.add_argument('fasta', help="The input fasta file")
    parser.add_argument('out', help="The (output) fastq file")
    parser.add_argument('-c',
                        '--compress',
                        help="If this flag is given, then "
                        "the output file will be compressed",
                        action='store_true')

    parser.add_argument('-q',
                        '--quality-mean',
                        help="The mean quality score to "
                        "use in the fastq file",
                        type=int,
                        default=default_quality_mean)

    parser.add_argument('--quality-std',
                        help="The standard deviation in the "
                        "quality scores",
                        type=int,
                        default=default_quality_std)

    parser.add_argument('--max-quality-score',
                        help="The maximum quality score. "
                        "Scores sampled higher than this will be floored.",
                        type=int,
                        default=default_max_quality_score)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    with ExitStack() as stack:
        fasta = stack.enter_context(open(args.fasta, "r"))
        fastq = stack.enter_context(
            utils.open(args.out, "w", compress=args.compress))

        for record in SeqIO.parse(fasta, 'fasta'):
            quals = np.random.normal(args.quality_mean, args.quality_std,
                                     len(record))
            # cap at the maximum quality score (rather than a hard-coded 40)
            # and convert to integers for the phred_quality annotation
            quals = np.clip(np.rint(quals), 0, args.max_quality_score)
            record.letter_annotations["phred_quality"] = [int(q) for q in quals]
            SeqIO.write(record, fastq, "fastq")
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script combines the Bayes' factors for the metagene profiles' "
        "periodicity and signal information to select the offsets for beginning the riboseq "
        "signal.\n\nThis script contains some hard-coded field names.")
    parser.add_argument('profile_bayes_factors', help="The (csv) file containing the Bayes' "
        "factors for periodicity at each length")
    parser.add_argument('out', help="The output (csv.gz) file")

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # read in the Bayes' factors and profiles
    bayes_factors_df = pd.read_csv(args.profile_bayes_factors)
    length_groups = bayes_factors_df.groupby('length')
    periodic_offsets = length_groups.apply(get_most_periodic_offset)
    pandas_utils.write_df(periodic_offsets, args.out, index=False)
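
get_most_periodic_offset is defined elsewhere and relies on the hard-coded field names mentioned in the description. A heavily simplified sketch of the idea, assuming columns named 'offset' and 'bayes_factor_mean'; these column names and the selection criterion are assumptions, not the actual fields.

def get_most_periodic_offset(length_group):
    # for a single read length, return the row whose offset has the
    # largest mean Bayes factor for periodicity (simplified criterion)
    best_index = length_group['bayes_factor_mean'].idxmax()
    return length_group.loc[best_index]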
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script visualizes the metagene profiles for each ORF type "
        "present in a given BED12+ file. It visualizes the mean and variance of normalized "
        "profiles in the first 21-bp, last 21-bp, and across all other 21-bp windows.")

    parser.add_argument('orfs', help="The BED12+ file containing the ORFs")
    parser.add_argument('profiles', help="The (mtx) file containing the ORF profiles")
    parser.add_argument('out', help="The base output name. The output filenames will be of "
        "the form: <out>.<orf-type>.<image-type>.")

    parser.add_argument('--min-profile', help="The minimum value of the sum over the profile "
        "to include it in the analysis", type=float, default=default_min_profile)

    parser.add_argument('--max-orfs', help="At most this many ORFs of each type will be "
        "used to create the figures. They will be sampled randomly from among those "
        "which meet the min-profile constraint.", type=int, default=default_max_orfs)

    parser.add_argument('--title', help="The prefix to use for the title of the plots",
        default=default_title)

    parser.add_argument('--image-type', help="The type of image files to create. The type "
        "must be recognized by matplotlib.", default=default_image_type)
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading ORFs"
    logger.info(msg)
    orfs = bed_utils.read_bed(args.orfs)

    msg = "Reading profiles"
    logger.info(msg)
    profiles = scipy.io.mmread(args.profiles).tocsr()

    msg = "Extracting the metagene profiles and creating the images"
    logger.info(msg)

    orf_type_groups = orfs.groupby('orf_type')
    orf_type_groups.apply(extract_profiles_and_plot, profiles, args)

    msg = "Finished"
    logger.info(msg)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script adds the ORF profiles from a set of profiles (presumably, "
        "each file corresponds to one replicate from a condition). The script keeps the "
        "profiles in sparse matrix format, so it is fairly efficient.")

    parser.add_argument('profiles',
                        help="The (mtx) files containing the "
                        "ORF profiles",
                        nargs='+')
    parser.add_argument(
        'out', help="The (mtx.gz) output file containing the merged profiles")

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading first ORF profile"
    logger.info(msg)

    merged_profiles = scipy.io.mmread(args.profiles[0]).tocsr()

    msg = "Adding each additional profile"
    logger.info(msg)

    for profile_file in args.profiles[1:]:
        msg = "Reading file: {}".format(profile_file)
        logger.info(msg)

        profiles = scipy.io.mmread(profile_file).tocsr()
        merged_profiles = merged_profiles + profiles

    msg = "Writing merged profiles to disk"
    logger.info(msg)

    math_utils.write_sparse_matrix(args.out, merged_profiles)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Run the Bayesian optimization-based approach for "
        "training models for algorithm selection.")

    clu.add_config(parser)
    clu.add_scenario(parser)
    clu.add_simple_presolver_options(parser)
    clu.add_num_cpus(parser)
    clu.add_cv_options(parser)

    automl_utils.add_automl_options(parser, default_total_training_time=20)
    automl_utils.add_blas_options(parser)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # see which folds to run
    if len(args.folds) == 0:
        args.folds = [f for f in range(args.num_folds)]
    clu.validate_folds_options(args)

    required_keys = ['base_path']
    config = as_asl_utils.load_config(args.config, required_keys)

    # check if we need to spawn a new process for blas
    if automl_utils.spawn_for_blas(args):
        return

    pipeline = parallel.apply_parallel_iter(args.folds,
                                            args.num_cpus,
                                            _outer_cv,
                                            args,
                                            config,
                                            progress_bar=True)
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script uses the peptides.txt file from MaxQuant to determine "
        "which predicted ORFs have some proteomics evidence.\n\nIt contains "
        "some hard-coded field names.")
    parser.add_argument('predicted_proteins', help="The (fasta, protein) file of "
        "predicted ORFs")
    parser.add_argument('peptides', help="The peptides.txt file produced by MaxQuant")
    parser.add_argument('out', help="The output (csv.gz) file containing the predicted "
        "ORFs and their coverage")
    
    parser.add_argument('--num-cpus', help="The number of CPUs to use for searching",
        type=int, default=default_num_cpus)

    parser.add_argument('--peptide-filter-field', help="The field to use for filtering "
        "the peptides from MaxQuant", default=default_peptide_filter_field)
    parser.add_argument('--peptide-filter-value', help="All peptides with a value greater "
        "than the filter value will be removed", type=float, default=default_peptide_filter_value)

    parser.add_argument('--peptide-separator', help="The separator in the --peptide file",
        default=default_peptide_separator)

    parser.add_argument('-g', '--num-groups', help="The number of groups into which to split "
        "the ORFs. More groups means the progress bar is updated more frequently but incurs "
        "more overhead because of the parallel calls.", type=int, default=default_num_groups)

    parser.add_argument('--num-peptides', help="If n>0, then only the first n peptide "
        "sequences will be used to calculate coverage. This is for testing.", type=int,
        default=default_num_peptides)
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[get-orf-peptide-matches]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    msg = "Reading and filtering peptides"
    logger.info(msg)

    peptides = pd.read_csv(args.peptides, sep=args.peptide_separator)
    mask_filter = peptides[args.peptide_filter_field] < args.peptide_filter_value
    peptides = peptides[mask_filter]
    peptide_sequences = pd.DataFrame(peptides['Sequence'])

    if args.num_peptides > 0:
        peptide_sequences = peptide_sequences.head(args.num_peptides)

    msg = "Number of filtered peptides: {}".format(len(peptide_sequences))
    logger.info(msg)

    msg = "Reading predicted ORFs into a data frame"
    logger.info(msg)

    # TODO: use read iterator
    predicted_orfs = bio.get_read_iterator(args.predicted_proteins)
    orf_ids = []
    orf_sequences = []

    for orf_id, seq in predicted_orfs:
        orf_ids.append(orf_id)
        orf_sequences.append(seq)

    predicted_orfs_df = pd.DataFrame()
    predicted_orfs_df['orf_id'] = orf_ids
    predicted_orfs_df['orf_sequence'] = orf_sequences

    msg = "Searching for matching peptides"
    logger.info(msg)

    peptide_matches = parallel.apply_parallel_split(peptide_sequences, args.num_cpus, 
        find_matching_orfs_group, predicted_orfs_df, progress_bar=True, num_groups=args.num_groups)

    # filter out the Nones to avoid DataFrame conversion problems
    msg = "Joining results back into large data frame"
    logger.info(msg)

    peptide_matches = [pm for pm in peptide_matches if pm is not None]
    peptide_matches = pd.concat(peptide_matches)

    # now, we have a data frame of matches (fields: peptide, orf_id)
    msg = "Getting peptide coverage of ORFs"
    logger.info(msg)

    # first, count the matches for each ORF
    peptide_matches_groups = peptide_matches.groupby('orf_id')

    orf_matches = parallel.apply_parallel_groups(peptide_matches_groups, args.num_cpus, 
        count_matches, progress_bar=True)
    orf_matches = pd.DataFrame(orf_matches)

    # then join back on the original list of ORFs to have entries for ORFs
    # with no peptide matches
    predicted_orf_coverage = pd.merge(predicted_orfs_df, orf_matches, on='orf_id', how="left")

    # and patch the holes in the data frame
    predicted_orf_coverage = predicted_orf_coverage.fillna(0)

    msg = "Writing coverage information to disk"
    pandas_utils.write_df(predicted_orf_coverage, args.out, index=False)
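
count_matches and find_matching_orfs_group are project helpers. A simplified sketch of what count_matches might compute for each orf_id group; the comment above notes the match fields are 'peptide' and 'orf_id', but the exact output columns are assumptions.

import pandas as pd


def count_matches(orf_group):
    # summarize the peptide matches for a single ORF
    return pd.Series({
        'orf_id': orf_group['orf_id'].iloc[0],
        'num_matches': len(orf_group),
        'num_unique_peptides': orf_group['peptide'].nunique()
    })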
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This is a helper script which submits a set of samples to SLURM. It "
        "can also be used to run a set of samples sequentially. Due to limitations on "
        "the config file specification, all of the samples must use the same reference "
        "indices (i.e., genome sequence, set of ORFs, etc.).")

    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--tmp', help="The temp directory", default=default_tmp)

    parser.add_argument('--flexbar-options', help="A space-delimited list of options to "
        "pass to flexbar. Each option must be quoted separately as in \"--flexbarOption value\". "
        "If specified, flexbar options will override default settings.", nargs='*', type=str)

    parser.add_argument('--overwrite', help="If this flag is present, existing files "
        "will be overwritten.", action='store_true')

    parser.add_argument('--profiles-only', help="If this flag is present, then only "
        "the pre-processing part of the pipeline will be called, i.e., profiles "
        "will be created for each sample specified in the config file, but no predictions "
        "will be made.", action='store_true')

    parser.add_argument('--merge-replicates', help="If this flag is present, then "
        "the ORF profiles from the replicates will be merged before making the final "
        "predictions", action='store_true')

    parser.add_argument('--run-replicates', help="If this flag is given with the "
        "--merge-replicates flag, then both the replicates *and* the individual "
        "samples will be run. This flag has no effect if --merge-replicates is not "
        "given.", action='store_true')
         
    parser.add_argument('-k', '--keep-intermediate-files', help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.", action='store_true')
    
    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs =  [
                    'flexbar',
                    args.star_executable,
                    'samtools',
                    'bowtie2',
                    'create-base-genome-profile',
                    'remove-multimapping-reads',
                    'extract-metagene-profiles',
                    'estimate-metagene-profile-bayes-factors',
                    'select-periodic-offsets',
                    'extract-orf-profiles',
                    'estimate-orf-bayes-factors',
                    'select-final-prediction-set',
                    'create-orf-profiles',
                    'predict-translated-orfs',
                    'run-rpbp-pipeline'
                ]
    shell_utils.check_programs_exist(programs)

    
    required_keys = [   
                        'riboseq_data',
                        'riboseq_samples',
                        'ribosomal_index',
                        'star_index',
                        'genome_base_path',
                        'genome_name',
                        'fasta',
                        'gtf'
                    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)

    # handle do_not_call so that we _do_ call the pipeline script, but that it does not run anything
    do_not_call_str = ""
    if not call:
        do_not_call_str = "--do-not-call"
    args.do_not_call = False

    overwrite_str = ""
    if args.overwrite:
        overwrite_str = "--overwrite"

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    # check if we only want to create the profiles, in this case
    # we call run-rpbp-pipeline with the --profiles-only option
    profiles_only_str = ""
    if args.profiles_only:
        args.merge_replicates = False
        profiles_only_str = "--profiles-only"
        msg = ("The --profiles-only option was given, this will override --merge-replicates "
               "and/or --run-replicates, if these options were also given!")
        logger.info(msg)

    # if we merge the replicates, then we only use the rpbp script to create
    # the ORF profiles, but we still make predictions
    if args.merge_replicates and not args.run_replicates:
        profiles_only_str = "--profiles-only"

    if args.run_replicates and not args.merge_replicates:
        msg = ("The --run-replicates option was given without the --merge-replicates "
            "option. It will be ignored.")
        logger.warning(msg)
    
    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)
    
    flexbar_option_str = ""
    if args.flexbar_options is not None:
        flexbar_option_str = "--flexbar-options {}".format(' '.join('"' + flx_op + '"'
            for flx_op in args.flexbar_options))

    
    # collect the job_ids in case we are using slurm and need to merge replicates
    job_ids = []

    sample_names = sorted(config['riboseq_samples'].keys())

    for sample_name in sample_names:
        data = config['riboseq_samples'][sample_name]

        tmp_str = ""
        if args.tmp is not None:
            tmp = os.path.join(args.tmp, "{}_{}_rpbp".format(sample_name, note))
            tmp_str = "--tmp {}".format(tmp)

        cmd = "run-rpbp-pipeline {} {} {} --num-cpus {} {} {} {} {} {} {} {} {} {}".format(
            data, 
            args.config, 
            sample_name, 
            args.num_cpus, 
            tmp_str, 
            do_not_call_str, 
            overwrite_str, 
            logging_str, 
            star_str, 
            profiles_only_str,
            flexbar_option_str,
            keep_intermediate_str,
            mem_str
        )

        job_id = slurm.check_sbatch(cmd, args=args)

        job_ids.append(job_id)

    # now, if we are running the "standard" pipeline, we are finished
    if not args.merge_replicates:
        return

    # otherwise, we need to merge the replicates for each condition
    riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
    merge_replicates_str = "--merge-replicates"

    for condition_name in sorted(riboseq_replicates.keys()):
    
        # then we predict the ORFs
        cmd = "predict-translated-orfs {} {} --num-cpus {} {} {} {} {}".format(
            args.config, 
            condition_name, 
            args.num_cpus, 
            do_not_call_str, 
            overwrite_str, 
            logging_str, 
            merge_replicates_str
        )

        slurm.check_sbatch(cmd, args=args, dependencies=job_ids)
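
shell_utils.check_programs_exist is called by most of these scripts before doing any work. A minimal sketch of what it presumably does; raising an exception (rather than, say, exiting) on a missing executable is an assumption.

import shutil


def check_programs_exist(programs):
    # verify that each required executable can be found on the PATH
    missing = [p for p in programs if shutil.which(p) is None]
    if missing:
        raise EnvironmentError(
            "Could not find the following programs: {}".format(', '.join(missing)))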
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script converts bam files to bigWig files. It is mostly "
        "a port of this script: https://github.com/chapmanb/bcbb/blob/master/nextgen/scripts/bam_to_wiggle.py "
        "by Brad Chapman which avoids a few dependencies.\n\nThe wigToBigWig "
        "program (from UCSC tools) must be in the path.\n\nN.B. If given, the "
        "start and end coordinates must be base-0.")

    parser.add_argument('bam', help="The bam file", nargs='+')
    parser.add_argument(
        '-o',
        '--overwrite',
        help="If this flag is given, then "
        "the bigWig file will be created whether it exists or not",
        action='store_true')
    parser.add_argument('-c',
                        '--chrom',
                        help="If specified, only alignments "
                        "from this chromosome will be in the output",
                        default=default_chrom)
    parser.add_argument('-s',
                        '--start',
                        help="If specied, only alignments "
                        "from this position will be in the output",
                        default=default_start)
    parser.add_argument('-e',
                        '--end',
                        help="If specied, only alignments "
                        "up to this position will be in the output",
                        default=default_end)

    parser.add_argument('-n',
                        '--normalize',
                        help="If this flag is given, "
                        "then values will be normalized to reads per million",
                        action='store_true')

    parser.add_argument(
        '-t',
        '--use-tempfile',
        help="If this flag is given, "
        "then a temp file will be used to avoid permission issues",
        action='store_true')

    parser.add_argument('-k',
                        '--keep-wig',
                        help="If this flag is given, then "
                        "the wiggle file will not be deleted",
                        action='store_true')

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    programs = ['wigToBigWig']
    shell_utils.check_programs_exist(programs)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    parallel.apply_parallel_iter(args.bam,
                                 args.num_cpus,
                                 bam_to_wiggle,
                                 args,
                                 progress_bar=True)
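
slurm.check_sbatch either submits the command to SLURM or runs it directly. A very rough sketch of that behavior, assuming sbatch --wrap is used for submission and that the job id is parsed from sbatch's output; the option handling is an assumption and is far simpler than the project's version.

import shlex
import subprocess


def check_sbatch(cmd, args=None, dependencies=None):
    # submit via sbatch when requested; otherwise run the command directly
    if args is not None and getattr(args, 'use_slurm', False):
        sbatch_cmd = ['sbatch']
        if dependencies:
            sbatch_cmd.append('--dependency=afterok:' +
                              ':'.join(str(d) for d in dependencies))
        sbatch_cmd += ['--wrap', cmd]
        out = subprocess.check_output(sbatch_cmd).decode()
        # sbatch prints "Submitted batch job <id>"
        return out.strip().split()[-1]
    subprocess.check_call(shlex.split(cmd))
    return None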
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script runs the Rp-Bp and Rp-chi pipelines on a given sample. "
        "It requires a YAML config file that includes a number of keys. Please see the "
        "documentation for a complete description.")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name', help="The name for the dataset, used in the created files")

    parser.add_argument('--tmp', help="The temp directory", default=default_tmp)

    parser.add_argument('--flexbar-options', help="A space-delimited list of options to "
        "pass to flexbar. Each option must be quoted separately as in \"--flexbarOption value\". "
        "If specified, flexbar options will override default settings.", nargs='*', type=str)
    
    parser.add_argument('--overwrite', help="If this flag is present, existing files "
        "will be overwritten.", action='store_true')

    parser.add_argument('--profiles-only', help="If this flag is present, then only "
        "the ORF profiles will be created", action='store_true')
         
    parser.add_argument('-k', '--keep-intermediate-files', help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.", action='store_true')
           
    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)
    call = not args.do_not_call


    # check that all of the necessary programs are callable
    programs =  [
                    'flexbar',
                    args.star_executable,
                    'samtools',
                    'bowtie2',
                    'create-base-genome-profile',
                    'remove-multimapping-reads',
                    'extract-metagene-profiles',
                    'estimate-metagene-profile-bayes-factors',
                    'select-periodic-offsets',
                    'extract-orf-profiles',
                    'estimate-orf-bayes-factors',
                    'select-final-prediction-set',
                    'create-orf-profiles',
                    'predict-translated-orfs'
                ]
    shell_utils.check_programs_exist(programs)

    
    required_keys = [   
                        'riboseq_data',
                        'ribosomal_index',
                        'star_index',
                        'genome_base_path',
                        'genome_name',
                        'fasta',
                        'gtf'
                    ]
    utils.check_keys_exist(config, required_keys)

    
    # now, check if we want to use slurm
    msg = "use_slurm: {}".format(args.use_slurm)
    logger.debug(msg)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    note_str = config.get('note', None)

    # the first step is the standard riboseq preprocessing
    
    # handle do_not_call so that we _do_ call the preprocessing script, 
    # but that it does not run anything
    do_not_call_str = ""
    if not call:
        do_not_call_str = "--do-not-call"

    overwrite_str = ""
    if args.overwrite:
        overwrite_str = "--overwrite"

    # for a sample, we first create its filtered genome profile
    
    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    flexbar_option_str = ""
    if args.flexbar_options is not None:
        flexbar_option_str = "--flexbar-options {}".format(' '.join('"' + flx_op + '"'
            for flx_op in args.flexbar_options))

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    cmd = ("create-orf-profiles {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}".format(args.raw_data, 
            args.config, args.name, args.num_cpus, mem_str, do_not_call_str, overwrite_str, 
            logging_str, star_str, tmp_str, flexbar_option_str, keep_intermediate_str))

    shell_utils.check_call(cmd)

    # check if we only want to create the profiles
    if args.profiles_only:
        return

    # then we predict the ORFs
    cmd = ("predict-translated-orfs {} {} --num-cpus {} {} {} {}".format(args.config, 
            args.name, args.num_cpus, do_not_call_str, overwrite_str, logging_str))
    shell_utils.check_call(cmd)
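
shell_utils.check_call presumably wraps subprocess with some logging. A minimal sketch under that assumption; the logging behavior is not confirmed by this listing.

import logging
import shlex
import subprocess

logger = logging.getLogger(__name__)


def check_call(cmd):
    # log the command and raise if it exits with a non-zero status
    logger.info("calling: {}".format(cmd))
    subprocess.check_call(shlex.split(cmd))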
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates the plots which detail the basic characteristics "
        "of the ORF predictions from the Rp-Bp pipeline. It also creates and compiles (if "
        "possible) a latex report for them.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('out', help="The base output directory for the latex report")

    parser.add_argument('--show-unfiltered-orfs', help="If this flag is "
        "present, bar charts showing the distribution of the types of the "
        "unfiltered ORF set will be included", action='store_true')
    
    parser.add_argument('--show-orf-periodicity', help="If this flag is "
        "present, bar charts showing the periodicity of each ORF type will be "
        "included in the report.", action='store_true')

    parser.add_argument('--uniprot', help="The uniprot ORF lengths, if available", 
        default=default_uniprot)
    parser.add_argument('--uniprot-label', help="The label to use for the uniprot ORFs in "
        "the plot", default=default_uniprot_label)
    
    parser.add_argument('--image-type', help="The format of the image files. This must be "
        "a format usable by matplotlib.", default=default_image_type)

    parser.add_argument('--overwrite', help="If this flag is present, existing files will "
        "be overwritten.", action='store_true')
        
    parser.add_argument('--note', help="If this option is given, it will be used in the "
        "filenames.\n\nN.B. This REPLACES the note in the config file.", default=default_note)

    parser.add_argument('--show-chisq', help="If this flag is given, then the "
        "results from Rp-chi will be included in the document; otherwise, they "
        "will not be created or shown.", action='store_true')

    parser.add_argument('-t', '--tmp', help="A location for temporary files",
        default=default_tmp)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)
    
    # keep multimappers?
    is_unique = 'keep_riboseq_multimappers' not in config

    programs =  [   
        'create-orf-length-distribution-line-graph',
        'create-orf-types-bar-chart',
        'visualize-orf-type-metagene-profiles'
    ]
    shell_utils.check_programs_exist(programs)
    
    required_keys = [
        'riboseq_data',
        'riboseq_samples'
    ]
    utils.check_keys_exist(config, required_keys)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # by default, we will not include chisq
    chisq_values = [False]
    if args.show_chisq:
        chisq_values = [True, False]

    filtered_values = [True]
    if args.show_unfiltered_orfs:
        filtered_values = [True, False]

    grouped_values = [True, False]
    
    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)

    # first, create all of the figures
    create_all_figures(config, args)

    note_str = config.get('note', None)
    out_note_str = note_str

    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note
    
    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations', None)

    project_name = config.get("project_name", default_project_name)
    title = "Rp-Bp prediction analysis for {}".format(project_name)
    abstract = "This document shows the results of the Rp-Bp pipeline analysis."

    
    #tex_file = os.path.join(args.out, "prediction-report.tex")
    tex_file = filenames.get_rpbp_prediction_report(args.out, out_note_str)

    
    with open(tex_file, 'w') as out:

        latex.begin_document(out, title, abstract)

        latex.write(out, "\n")

        latex.clearpage(out)

        ### ORF type distributions
        title = "Predicted ORF type distributions"
        latex.section(out, title)

        # first, handle all of the regular datasets
        sample_names = sorted(config['riboseq_samples'].keys())
        
        # and check if we also have replicates
        replicate_names = []
        if 'riboseq_biological_replicates' in config:
            replicate_names = sorted(ribo_utils.get_riboseq_replicates(config).keys())

        strands = ["+", "-"]

        i = 0
        for sample_name in sample_names:
            
            try:
                lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                    config, sample_name, is_unique=is_unique)
            except FileNotFoundError:
                msg = ("Could not parse out lengths and offsets for sample: {}. "
                    "Skipping".format(sample_name))
                logger.error(msg)
                continue
            
            caption = "ORF types: {}".format(sample_name)
            is_first = True

            # first, just dump all of the bar charts to the page            
            it = itertools.product(grouped_values, chisq_values, filtered_values)

            for is_grouped, is_chisq, is_filtered in it:

                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations
                
                orf_types_bar_chart = filenames.get_orf_types_bar_chart(
                    config['riboseq_data'], 
                    sample_name, 
                    length=lengths, 
                    offset=offsets, 
                    is_unique=is_unique, 
                    note=out_note_str, 
                    image_type=args.image_type,
                    fraction=f, 
                    reweighting_iterations=rw,
                    is_grouped=is_grouped, 
                    is_chisq=is_chisq, 
                    is_filtered=is_filtered
                )

                msg = "Looking for image file: {}".format(orf_types_bar_chart)
                logger.debug(msg)

                if os.path.exists(orf_types_bar_chart):
                    if is_first or (i%4 == 0):
                        latex.begin_figure(out)
                        is_first = False
                    
                    i += 1
                    latex.write_graphics(out, orf_types_bar_chart, height=0.15)

                    if i%6 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)

                else:
                    msg = "Could not find image: {}".format(orf_types_bar_chart)
                    logger.warning(msg)



            if (i > 0) and (i%6) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i%6 != 0:
            latex.clearpage(out)

    
        # now, if the config file specifies replicates, create figures for those                
        i = 0
        for replicate_name in replicate_names:
            lengths = None
            offsets = None

            caption = "ORF types: {}".format(replicate_name)

            it = itertools.product(grouped_values, chisq_values, filtered_values)

            is_first = True

            for is_grouped, is_chisq, is_filtered in it:

                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                
                orf_types_bar_chart = filenames.get_orf_types_bar_chart(
                    config['riboseq_data'], 
                    replicate_name, 
                    length=lengths, 
                    offset=offsets, 
                    is_unique=is_unique, 
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq,
                    is_filtered=is_filtered
                )

                
                msg = "Looking for image file: {}".format(orf_types_bar_chart)
                logger.debug(msg)

                if os.path.exists(orf_types_bar_chart):
                    if is_first or (i%4 == 0):
                        latex.begin_figure(out)
                        is_first = False
                    
                    i += 1
                    latex.write_graphics(out, orf_types_bar_chart, height=0.15)

                    if i%6 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)

                else:
                    msg = "Could not find image: {}".format(orf_types_bar_chart)
                    logger.warning(msg)


            if (i > 0) and (i%6) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i%6 != 0:
            latex.clearpage(out)


        ### ORF type length distributions
        title = "Predicted ORF type length distributions"
        latex.section(out, title)

        i = 0
        for sample_name in sample_names:
            
            try:
                lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                    config, sample_name, is_unique=is_unique)
            except FileNotFoundError:
                msg = ("Could not parse out lengths and offsets for sample: {}. "
                    "Skipping".format(sample_name))
                logger.error(msg)
                continue
            
            caption = "ORF type length distributions: {}".format(sample_name)

            is_first = True
            it = itertools.product(grouped_values, chisq_values)
            
            for is_grouped, is_chisq in it:

                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                orf_length_line_graph = filenames.get_orf_length_distribution_line_graph(
                    config['riboseq_data'], 
                    sample_name,
                    length=lengths,
                    offset=offsets, 
                    is_unique=is_unique,
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq
                )

                if os.path.exists(orf_length_line_graph):
            
                    if is_first or (i%4 == 0):
                        latex.begin_figure(out)
                        is_first = False
                    
                    i += 1
                    latex.write_graphics(out, orf_length_line_graph, height=0.15)

                    if i%4 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)

                else:
                    msg = "Could not find image: {}".format(orf_length_line_graph)
                    logger.debug(msg)


            if (i > 0) and (i%4) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i%4 != 0:
            latex.clearpage(out)

        # now, if the config file specifies replicates, create figures for those  
        i = 0
        for replicate_name in replicate_names:
            lengths = None
            offsets = None

            caption = "ORF types: {}".format(replicate_name)

            is_first = True
            it = itertools.product(grouped_values, chisq_values)
            
            for is_grouped, is_chisq in it:

                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                orf_length_line_graph = filenames.get_orf_length_distribution_line_graph(
                    config['riboseq_data'],
                    replicate_name,
                    length=lengths,
                    offset=offsets, 
                    is_unique=is_unique,
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq
                )
                
                if os.path.exists(orf_length_line_graph):
            
                    if is_first or (i%4 == 0):
                        latex.begin_figure(out)
                        is_first = False
                    
                    i += 1
                    latex.write_graphics(out, orf_length_line_graph, height=0.15)

                    if i%4 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)

                else:
                    msg = "Could not find image: {}".format(orf_length_line_graph)
                    logger.debug(msg)

            if (i > 0) and (i%4) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i%4 != 0:
            latex.clearpage(out)

        ### ORF type metagene profiles
        if args.show_orf_periodicity:
            title = "Predicted ORF type metagene profiles"
            latex.section(out, title)
            
            i = 0
            for sample_name in sample_names:
                
                try:
                    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                        config, sample_name, is_unique=is_unique)
                except FileNotFoundError:
                    msg = ("Could not parse out lengths and offsets for sample: {}. "
                        "Skipping".format(sample_name))
                    logger.error(msg)
                    continue
                
                caption = "ORF type metagene profiles: {}".format(sample_name)

                is_first = True

                for is_chisq in chisq_values:

                    if is_chisq:
                        f = None
                        rw = None
                    else:
                        f = fraction
                        rw = reweighting_iterations

                    orf_type_profile_base = filenames.get_orf_type_profile_base(
                        config['riboseq_data'], 
                        sample_name, 
                        length=lengths, offset=offsets, 
                        is_unique=is_unique, 
                        note=out_note_str,
                        fraction=f, reweighting_iterations=rw,
                        is_chisq=is_chisq)

                    it = itertools.product(ribo_utils.orf_types, strands)

                    for orf_type, strand in it:
                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base, 
                            orf_type, 
                            strand, 
                            args.image_type
                        )

                        msg = "Looking for image file: {}".format(orf_type_profile)
                        logger.debug(msg)
                        if os.path.exists(orf_type_profile):

                            if is_first or (i%4 == 0):
                                latex.begin_figure(out)
                                is_first = False

                            i += 1
                            latex.write_graphics(out, orf_type_profile, height=0.23)

                            if i%4 == 0:
                                latex.write_caption(out, caption)
                                latex.end_figure(out)
                                latex.clearpage(out)

                        else:
                            msg = "Could not find image: {}".format(orf_type_profile)
                            logger.warning(msg)

                if (i > 0) and (i%4 != 0):
                    latex.write_caption(out, caption)
                    latex.end_figure(out)
                    #latex.clearpage(out)

            if i%4 != 0:
                latex.clearpage(out)

            i = 0
            for replicate_name in replicate_names:
                lengths = None
                offsets = None
                            
                caption = "ORF type metagene profiles: {}".format(replicate_name)
                is_first = True
                for is_chisq in chisq_values:

                    if is_chisq:
                        f = None
                        rw = None
                    else:
                        f = fraction
                        rw = reweighting_iterations

                    orf_type_profile_base = filenames.get_orf_type_profile_base(
                        config['riboseq_data'], 
                        replicate_name, 
                        length=lengths, offset=offsets, 
                        is_unique=is_unique, 
                        note=out_note_str, 
                        fraction=f, reweighting_iterations=rw,
                        is_chisq=is_chisq
                    )

                    it = itertools.product(ribo_utils.orf_types, strands)

                    for orf_type, strand in it:

                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base, 
                            orf_type, 
                            strand, 
                            args.image_type
                        )

                        if os.path.exists(orf_type_profile):

                            if is_first or (i%4 == 0):
                                latex.begin_figure(out)
                                is_first = False

                            i += 1
                            latex.write_graphics(out, orf_type_profile, height=0.23)

                            if i % 4 == 0:
                                latex.write_caption(out, caption)
                                latex.end_figure(out)
                                latex.clearpage(out)
                        else:
                            msg = "Could not find image: {}".format(orf_type_profile)
                            logger.debug(msg)
                
                if (i > 0) and (i%4 != 0):
                    latex.write_caption(out, caption)
                    latex.end_figure(out)
                    #latex.clearpage(out)

            if i%4 != 0:
                latex.clearpage(out)

        latex.end_document(out)

    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)
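
The latex.* helpers used throughout the report generation are project utilities. A rough sketch of a few of them, assuming they simply emit the corresponding LaTeX commands to the open file handle; the figure placement options and the interpretation of height as a fraction of \textheight are assumptions.

def begin_figure(out):
    out.write("\\begin{figure}[htbp]\n\\centering\n")


def write_graphics(out, path, height=0.5):
    # height is treated as a fraction of \textheight (assumed convention)
    out.write("\\includegraphics[height={}\\textheight]{{{}}}\n".format(height, path))


def write_caption(out, caption):
    out.write("\\caption{{{}}}\n".format(caption))


def end_figure(out):
    out.write("\\end{figure}\n")


def clearpage(out):
    out.write("\\clearpage\n")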
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script uses the peptides.txt file from MaxQuant to determine "
        "which predicted ORFs have some proteomics evidence.\n\nIt contains "
        "some hard-coded field names.")
    parser.add_argument('predicted_proteins',
                        help="The (fasta, protein) file of "
                        "predicted ORFs")
    parser.add_argument('peptides',
                        help="The peptides.txt file produced by MaxQuant")
    parser.add_argument(
        'out',
        help="The output (csv.gz) file containing the predicted "
        "ORFs and their coverage")

    parser.add_argument('--num-cpus',
                        help="The number of CPUs to use for searching",
                        type=int,
                        default=default_num_cpus)

    parser.add_argument('--peptide-filter-field',
                        help="The field to use for filtering "
                        "the peptides from MaxQuant",
                        default=default_peptide_filter_field)
    parser.add_argument('--peptide-filter-value',
                        help="All peptides with a value greater "
                        "than the filter value will be removed",
                        type=float,
                        default=default_peptide_filter_value)

    parser.add_argument('--peptide-separator',
                        help="The separator in the --peptide file",
                        default=default_peptide_separator)

    parser.add_argument(
        '-g',
        '--num-groups',
        help="The number of groups into which to split "
        "the ORFs. More groups means the progress bar is updated more frequently but incurs "
        "more overhead because of the parallel calls.",
        type=int,
        default=default_num_groups)

    parser.add_argument(
        '--num-peptides',
        help="If n>0, then only the first n peptide "
        "sequences will be used to calculate coverage. This is for testing.",
        type=int,
        default=default_num_peptides)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[get-orf-peptide-matches]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    msg = "Reading and filtering peptides"
    logger.info(msg)

    peptides = pd.read_csv(args.peptides, sep=args.peptide_separator)
    mask_filter = peptides[
        args.peptide_filter_field] < args.peptide_filter_value
    peptides = peptides[mask_filter]
    peptide_sequences = pd.DataFrame(peptides['Sequence'])

    if args.num_peptides > 0:
        peptide_sequences = peptide_sequences.head(args.num_peptides)

    msg = "Number of filtered peptides: {}".format(len(peptide_sequences))
    logger.info(msg)

    msg = "Reading predicted ORFs into a data frame"
    logger.info(msg)

    # TODO: use read iterator
    predicted_orfs = bio.get_read_iterator(args.predicted_proteins)
    orf_ids = []
    orf_sequences = []

    for orf_id, seq in predicted_orfs:
        orf_ids.append(orf_id)
        orf_sequences.append(seq)

    predicted_orfs_df = pd.DataFrame()
    predicted_orfs_df['orf_id'] = orf_ids
    predicted_orfs_df['orf_sequence'] = orf_sequences

    msg = "Searching for matching peptides"
    logger.info(msg)

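    # split the peptides into groups and search each group against all of the
    # predicted ORFs in parallel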
    peptide_matches = parallel.apply_parallel_split(peptide_sequences,
                                                    args.num_cpus,
                                                    find_matching_orfs_group,
                                                    predicted_orfs_df,
                                                    progress_bar=True,
                                                    num_groups=args.num_groups)

    # filter out the Nones to avoid DataFrame conversion problems
    msg = "Joining results back into large data frame"
    logger.info(msg)

    peptide_matches = [pm for pm in peptide_matches if pm is not None]
    peptide_matches = pd.concat(peptide_matches)

    # now, we have a data frame of matches (fields: peptide, orf_id)
    msg = "Getting peptide coverage of ORFs"
    logger.info(msg)

    # first, count the matches for each ORF
    peptide_matches_groups = peptide_matches.groupby('orf_id')

    orf_matches = parallel.apply_parallel_groups(peptide_matches_groups,
                                                 args.num_cpus,
                                                 count_matches,
                                                 progress_bar=True)
    orf_matches = pd.DataFrame(orf_matches)

    # then join back on the original list of ORFs to have entries for ORFs
    # with no peptide matches
    predicted_orf_coverage = pd.merge(predicted_orfs_df,
                                      orf_matches,
                                      on='orf_id',
                                      how="left")

    # and patch the holes in the data frame
    predicted_orf_coverage = predicted_orf_coverage.fillna(0)

    msg = "Writing coverage information to disk"
    utils.write_df(predicted_orf_coverage, args.out, index=False)
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script removes all of the peptides which match to multiple "
        "ORFs from the results found with get-all-orf-peptide-matches.")

    parser.add_argument('peptide_matches', help="The peptide matches file produced "
        "by get-all-orf-peptide-matches")
    parser.add_argument('out', help="A similar peptide matches file which "
        "contains only peptides which match to a unique ORF")

    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use",
        type=int, default=default_num_cpus)
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading peptide matches"
    logger.info(msg)

    peptide_matches = pd.read_csv(args.peptide_matches)

    msg = "Splitting the grouped matches into individual peptide matches"
    logger.info(msg)

    matches = parallel.apply_parallel(  peptide_matches, 
                                        args.num_cpus, 
                                        parse_matches, 
                                        progress_bar=True)

    msg = "Removing peptides which match to multiple ORFs"
    logger.info(msg)

    matches = utils.remove_nones(matches)
    matches = utils.flatten_lists(matches)
    matches_df = pd.DataFrame(matches)
    unique_matches_df = matches_df.drop_duplicates(subset='peptide', keep=False)

    msg = "Merging the ORF-peptide matches back to single records"
    logger.info(msg)

    unique_groups = unique_matches_df.groupby('orf_id')
    merged_unique_groups = parallel.apply_parallel_groups(  unique_groups, 
                                                            args.num_cpus, 
                                                            merge_group, 
                                                            progress_bar=True)

    merged_unique_df = pd.DataFrame(merged_unique_groups)

    msg = "Re-adding the ORFs which no longer have peptide matches"
    logger.info(msg)

    m_still_has_match = peptide_matches['orf_id'].isin(merged_unique_df['orf_id'])
    peptide_matches.loc[~m_still_has_match, 'num_matches'] = 0
    peptide_matches.loc[~m_still_has_match, 'peptide_matches'] = 0

    peps = [merged_unique_df, peptide_matches[~m_still_has_match]]
    merged_unique_df = pd.concat(peps)

    msg = "Writing the ORFs with unique matches to disk"
    logger.info(msg)

    pandas_utils.write_df(merged_unique_df, args.out, index=False)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates a stacked bar chart of the uniquely- "
        "and multi-mapping read counts found by count-aligned-reads")

    parser.add_argument('read_counts',
                        help="The counts csv file from "
                        "count-aligned-reads")
    parser.add_argument('out', help="The output image file")

    parser.add_argument('--y-tick-frequency',
                        help="The frequency of the ticks "
                        "on the y-axis",
                        type=int,
                        default=default_y_tick_frequency)

    parser.add_argument('-t',
                        '--title',
                        help="The title for the plot",
                        default=default_title)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading count"
    logger.info(msg)
    read_counts = pd.read_csv(args.read_counts)

    msg = "Extracting bar heights"
    logger.info(msg)

    fields = ['uniquely_aligned_reads', 'multimapping_reads']
    pretty_fields = ['Unique', 'Multimapping']
    read_counts_np = read_counts[fields].values

    msg = "Determing y-ticks"
    logger.info(msg)

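    # place a tick every y_tick_frequency reads, up to (roughly) the largest
    # aligned-read count, and label the ticks in scientific notation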
    ytm = np.max(read_counts['aligned_reads']) / args.y_tick_frequency
    y_tick_max = round(ytm) * args.y_tick_frequency
    y_ticks = np.arange(0, y_tick_max, args.y_tick_frequency)
    y_tick_labels = ["{:.1e}".format(y) for y in y_ticks]

    msg = "Plotting the bars"
    logger.info(msg)

    x_tick_labels = read_counts['dataset']

    fig, ax = plt.subplots()

    mpl_utils.create_stacked_bar_plot(ax,
                                      read_counts_np,
                                      stack_labels=pretty_fields,
                                      colors=plt.cm.Blues,
                                      edge_colors='k',
                                      gap=0.1,
                                      end_gaps=True,
                                      y_ticks=y_ticks,
                                      y_tick_labels=y_tick_labels,
                                      x_tick_labels=x_tick_labels)

    if args.title is not None:
        ax.set_title(args.title)

    msg = "Saving plot to disk"
    logger.info(msg)

    fig.savefig(args.out, bbox_inches='tight')
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Extract the ORF profiles for each specified read length "
        "and offset independently. One sparse matrix file will be created for "
        "each read length. It then collects the values into a sparse tensor.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name',
                        help="The name for the dataset, used in the "
                        "created files")

    parser.add_argument('out',
                        help="The (mtx.gz) output file containing the "
                        "ORF profiles and read lengths")

    parser.add_argument(
        '-c',
        '--is-condition',
        help="If this flag is present, "
        "then \"name\" will be taken to be a condition name. The profiles for "
        "all relevant replicates of the condition will be created.",
        action='store_true')

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    cpus_str = "--num-cpus {}".format(args.num_cpus)

    msg = "[create-read-length-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    msg = "Reading config file"
    logger.info(msg)
    config = yaml.load(open(args.config))

    # pull out what we need from the config file
    is_unique = not ('keep_riboseq_multimappers' in config)
    seqname_str = utils.get_config_argument(config, 'seqname_prefix')
    note = config.get('note', None)
    orf_note = config.get('orf_note', None)

    orfs = filenames.get_orfs(config['genome_base_path'],
                              config['genome_name'],
                              note=orf_note)

    exons = filenames.get_exons(config['genome_base_path'],
                                config['genome_name'],
                                note=orf_note)

    # make sure the necessary files exist
    required_files = [orfs, exons]
    msg = "[create-read-length-orf-profiles]: Some input files were missing: "
    utils.check_files_exist(required_files, msg=msg)

    # check which samples to process
    names = [args.name]
    if args.is_condition:
        riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
        names = [n for n in riboseq_replicates[args.name]]

    job_ids = []
    for name in names:

        msg = "Processing sample: {}".format(name)
        logger.info(msg)

        # now the relevant files
        bam = filenames.get_riboseq_bam(config['riboseq_data'],
                                        name,
                                        is_unique=is_unique,
                                        note=note)

        # make sure the necessary files exist
        required_files = [bam]
        msg = "[create-read-length-orf-profiles]: Some input files were missing: "
        utils.check_files_exist(required_files, msg=msg)

        # get the lengths and offsets which meet the required criteria from the config file
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config, name, is_unique=is_unique)

        if len(lengths) == 0:
            msg = (
                "No periodic read lengths and offsets were found. Try relaxing "
                "min_metagene_profile_count, min_metagene_bf_mean, "
                "max_metagene_bf_var, and/or min_metagene_bf_likelihood. Qutting."
            )
            logger.critical(msg)
            return

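        # submit one extract-orf-profiles job for each periodic (length, offset) pair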
        for length, offset in zip(lengths, offsets):
            lengths_str = "--lengths {}".format(length)
            offsets_str = "--offsets {}".format(offset)

            mtx = filenames.get_riboseq_profiles(config['riboseq_data'],
                                                 name,
                                                 length=[length],
                                                 offset=[offset],
                                                 is_unique=is_unique,
                                                 note=note)

            cmd = "extract-orf-profiles {} {} {} {} {} {} {} {}".format(
                bam, orfs, exons, mtx, lengths_str, offsets_str, seqname_str,
                cpus_str, logging_str)

            job_id = slurm.check_sbatch(cmd, args=args)

            job_ids.append(job_id)

    # now, collect them into a single file
    offsets_str = ' '.join(str(o) for o in offsets)
    lengths_str = ' '.join(str(l) for l in lengths)

    offsets_str = "--offsets {}".format(offsets_str)
    lengths_str = "--lengths {}".format(lengths_str)

    is_condition_str = ""
    if args.is_condition:
        is_condition_str = "--is-condition"

    cmd = "collect-read-length-orf-profiles {} {} {} {} {}".format(
        args.config, args.name, args.out, is_condition_str, logging_str)

    slurm.check_sbatch(cmd, args=args, dependencies=job_ids)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Given a meme motif file, extract the gene names and map "
        "them to ensembl identifiers using pyensembl. The pyensembl database "
        "information can be given either in a yaml config file or as command "
        "line options. The yaml config file values have precedence over the "
        "command line options.")

    parser.add_argument('meme', help="The meme file")
    parser.add_argument('out', help="The output file")

    parser.add_argument(
        '-c',
        '--config',
        help="The yaml config file. If "
        "given, this should include keys 'genome_name' and 'gtf'. Otherwise, "
        "they may be specified using the respective command line options.",
        default=None)

    parser.add_argument('-n',
                        '--genome-name',
                        help="The genome_parameter for "
                        "retrieving the pyensembl database",
                        default=None)
    parser.add_argument('-g',
                        '--gtf',
                        help="The gtf file for pyensembl",
                        default=None)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # if the config file was given, use any values in it to replace those
    # passed on the command line
    if args.config is not None:
        msg = "Reading config file"
        logger.info(msg)
        config = yaml.load(open(args.config))

        args.genome_name = config.get('genome_name', args.genome_name)
        args.gtf = config.get('gtf', args.gtf)

    msg = "genome_name: {}".format(args.genome_name)
    logger.debug(msg)

    msg = "gtf: {}".format(args.gtf)
    logger.debug(msg)

    msg = "Loading pyensembl database"
    logger.info(msg)

    ensembl = pyensembl.Genome(reference_name=args.genome_name,
                               annotation_name="ensembl",
                               gtf_path_or_url=args.gtf)

    # this will create the database if needed
    ensembl.index()

    msg = "Parsing motif gene names"
    logger.info(msg)

    # a line from CISBP looks like:
    #   MOTIF M002_0.6 (Ankhd1)_(Homo_sapiens)_(RBD_1.00)

    all_motifs = []

    motif_re = ("\((?P<gene_name>[^\)]+)\)_\((?P<species>[^\)]+)\)_"
                "\((?P<rbd_score>[^\)]+)\)")
    motif_re = re.compile(motif_re)

    with open(args.meme) as meme_f:
        for line in meme_f:
            if line.startswith("MOTIF"):
                (key, motif_name, info) = line.split()

                m = motif_re.match(info)

                if m is None:
                    msg = ("Could not parse gene name. Guessing the entire "
                           "string is the gene name: '{}'.".format(info))
                    logger.warning(msg)
                    gene_name = info
                else:
                    gene_name = m.group("gene_name")

                try:
                    ensembl_ids = ensembl.gene_ids_of_gene_name(gene_name)
                except ValueError:
                    msg = ("Could not find Ensembl identifier for gene_name: "
                           "'{}'".format(gene_name))
                    logger.warning(msg)
                    ensembl_ids = [gene_name]

                for ensembl_id in ensembl_ids:
                    motif = {
                        "motif_name": motif_name,
                        "gene_name": gene_name,
                        "ensembl_id": ensembl_id
                    }

                    all_motifs.append(motif)

    msg = "Joining motif gene names into large data frame"
    logger.info(msg)
    all_motifs_df = pd.DataFrame(all_motifs)

    msg = "Writing motifs to disk"
    logger.info(msg)
    utils.write_df(all_motifs_df, args.out, index=False)
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates a simple latex document containing the read "
        "filtering images, metagene profiles and analysis, and standard section text.")
    parser.add_argument('config', help="The (yaml) config file for the project")
    parser.add_argument('out', help="The path for the output files")

    parser.add_argument('--show-orf-periodicity', help="If this flag is "
        "present, bar charts showing the periodicity of each ORF type will be "
        "included in the report.", action='store_true')

    parser.add_argument('--show-read-length-bfs', help="If this flag is given, "
        "plots showing the Bayes factor at each offset for each read length "
        "are included in the report.", action='store_true')

    parser.add_argument('--overwrite', help="If this flag is present, existing files will "
        "be overwritten.", action='store_true')

    parser.add_argument('--min-visualization-count', help="Read lengths with fewer than this "
        "number of reads will not be included in the report.", type=int, 
        default=default_min_visualization_count)

    parser.add_argument('--image-type', help="The type of image file to create. This "
        "must be an extension which matplotlib can interpret.", default=default_image_type)

    parser.add_argument('-c', '--create-fastqc-reports', help="If this flag is given, then "
        "fastqc reports will be created for most fastq and bam files. By default, they are "
        "not created.", action='store_true')
    
    parser.add_argument('--tmp', help="If the fastqc reports are created, "
        "they will use this location for temp files", default=default_tmp)
     
    parser.add_argument('--note', help="If this option is given, it will be used in the "
        "filenames.\n\nN.B. This REPLACES the note in the config file.", default=default_note)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config))

    if args.note is not default_note:
        config['note'] = args.note
    note = config.get('note', None)
    
    sample_names = sorted(config['riboseq_samples'].keys())
    
    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    programs = [
        'create-read-length-metagene-profile-plot',
        'visualize-metagene-profile-bayes-factor',
        'get-all-read-filtering-counts',
        'samtools',
        'visualize-read-filtering-counts',
        'get-read-length-distribution',
        'plot-read-length-distribution'
    ]

    if args.create_fastqc_reports:
        programs.extend(['fastqc','java'])
        
    shell_utils.check_programs_exist(programs)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)
   
    # first, create the read filtering information... 
    create_read_filtering_plots(args.config, config, args)
    # ... and all the other figures.
    for name in sample_names:
        periodic_offsets = filenames.get_periodic_offsets(config['riboseq_data'], 
            name, is_unique=is_unique, note=note)
        offsets_df = pd.read_csv(periodic_offsets)
        create_figures(args.config, config, name, offsets_df, args)

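    # the metagene filtering thresholds from the config; these are used to annotate
    # the status of each read length in the periodicity section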
    min_metagene_profile_count = config.get(
        "min_metagene_profile_count", default_min_metagene_profile_count)

    min_metagene_profile_bayes_factor_mean = config.get(
        "min_metagene_profile_bayes_factor_mean", 
        default_min_metagene_profile_bayes_factor_mean)

    max_metagene_profile_bayes_factor_var = config.get(
        "max_metagene_profile_bayes_factor_var", 
        default_max_metagene_profile_bayes_factor_var)

    project_name = config.get("project_name", default_project_name)
    title = "Preprocessing results for {}".format(project_name)

    tex_file = os.path.join(args.out, "preprocessing-report.tex")
    with open(tex_file, 'w') as out:

        latex.begin_document(out, title, abstract, commands=commands)

        latex.section(out, "Introduction")

        latex.clearpage(out)
        latex.newpage(out)

        latex.section(out, "Mapping and filtering")
        latex.write(out, mapping_and_filtering_text)

        # the read filtering figures
        read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=note, image_type=args.image_type)
    
        n = "no-rrna-{}".format(note)
        no_rrna_read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=n, image_type=args.image_type)

        latex.begin_figure(out)
        latex.write_graphics(out, read_filtering_image, width=0.45)
        latex.write_graphics(out, no_rrna_read_filtering_image, width=0.45)
        latex.write_caption(out, read_filtering_caption, label=read_filtering_label)
        latex.end_figure(out)

        latex.clearpage(out)

        # the read length distributions
        latex.section(out, "Read length distributions", 
            label=length_distribution_section_label)

        msg = "Writing length distribution figures"
        logger.info(msg)

        latex.begin_table(out, "cc")

        latex.write_header(out, ["All aligned reads", "Uniquely-aligning reads"])

        for name in sample_names:
            data = config['riboseq_samples'][name]
            read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'], name, is_unique=False, note=note, 
                image_type=args.image_type)

            unique_read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'], name, is_unique=True, note=note, 
                image_type=args.image_type)

            
            msg = "Looking for image file: {}".format(read_length_distribution_image)
            logger.debug(msg)

            if os.path.exists(read_length_distribution_image):
                latex.write_graphics(out, read_length_distribution_image, width=0.45)
            else:
                msg = "Could not find image: {}".format(read_length_distribution_image)
                logger.warning(msg)
                
                text = "Missing: {}\n\n".format(name)
                latex.write(out, text)

            latex.write_column_sep(out)
            

            msg = "Looking for image file: {}".format(unique_read_length_distribution_image)
            logger.debug(msg)

            if os.path.exists(unique_read_length_distribution_image):
                latex.write_graphics(out, unique_read_length_distribution_image, width=0.45)
            else:
                msg = "Could not find image: {}".format(unique_read_length_distribution_image)
                logger.warning(msg)
            
                text = "Missing: {}\n\n".format(name)
                latex.write(out, text)

            latex.write_row_sep(out)

            

        latex.end_table(out)
        latex.clearpage(out)


        latex.section(out, "Read length periodicity", label=periodicity_label)

        for name in sample_names:
            i = 0

            data = config['riboseq_samples'][name]

            msg = "Processing sample: {}".format(name)
            logger.info(msg)

            logger.debug("overwrite: {}".format(args.overwrite))
    
            periodic_offsets = filenames.get_periodic_offsets(config['riboseq_data'], 
                name, is_unique=is_unique, note=note)
            offsets_df = pd.read_csv(periodic_offsets)

            min_read_length = int(offsets_df['length'].min())
            max_read_length = int(offsets_df['length'].max())
    

            latex.begin_table(out, "YY")

            header = "\\multicolumn{2}{c}{" + name + "}"
            header = [header]
            latex.write_header(out, header)

            for length in range(min_read_length, max_read_length + 1):
                msg = "Processing length: {}".format(length)
                logger.info(msg)

                # check which offset is used
                
                # select the row for this length
                mask_length = offsets_df['length'] == length

                # TODO: this is sometimes length 0. why?
                if sum(mask_length) == 0:
                    continue

                length_row = offsets_df[mask_length].iloc[0]

                # now, check all of the filters
                offset = int(length_row['highest_peak_offset'])
                offset_status = "Used for analysis"
                
                if length_row['highest_peak_bf_mean'] < min_metagene_profile_bayes_factor_mean:
                    offset_status = "BF mean too small"

                if length_row['highest_peak_bf_var'] > max_metagene_profile_bayes_factor_var:
                    offset_status = "BF variance too high"

                if length_row['highest_peak_profile_sum'] < min_metagene_profile_count:
                    offset_status = "Count too small"
                
                if length_row['highest_peak_profile_sum'] < args.min_visualization_count:
                    msg = "Not enough reads of this length. Skipping."
                    logger.warning(msg)
                    continue

                metagene_profile_image = filenames.get_metagene_profile_image(
                    config['riboseq_data'], name, image_type=args.image_type, 
                    is_unique=is_unique, length=length, note=note)
                
                #title = ("length: {}. P-site offset: {}. \\newline status: {}"
                    #"\n".format(length, offset, offset_status))
                #latex.write(out, title, size="scriptsize")
                title = ("Length: {}. P-site offset: {}. Status: {}\n".format(length, offset, offset_status))
                if args.show_read_length_bfs:
                    title = "\scriptsize{" + title + "}"
                    title = "\\multicolumn{2}{c}{" + title + "}"
                    latex.write(out, title)
                    latex.write_row_sep(out)
                else:
                    latex.write(out, title, size="scriptsize")

                latex.write_graphics(out, metagene_profile_image, width=0.45)
                               
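                # the metagene profile images are laid out two per table row: a column
                # separator after the first image and a row separator after the second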
                i += 1
                if i%2 == 1:
                    latex.write_column_sep(out)
                else:
                    latex.write_row_sep(out)


                if args.show_read_length_bfs:
                    
                    bayes_factor_image = filenames.get_metagene_profile_bayes_factor_image(
                        config['riboseq_data'], name, image_type=args.image_type, 
                        is_unique=is_unique, length=length, note=note)

                    #latex.centering(out)
                    latex.write_graphics(out, bayes_factor_image, width=0.45)
                        
                    i += 1
                    if i%2 == 1:
                        latex.write_column_sep(out)
                    else:
                        latex.write_row_sep(out)

                               
            if i%2 == 1:
                latex.write_row_sep(out)

            latex.end_table(out)
            latex.clearpage(out)

        ### ORF type metagene profiles
        if args.show_orf_periodicity:
            title = "ORF type periodicity"
            latex.section(out, title)
            
            strands = ['+', '-']
            for sample_name in sample_names:
                i = 0

                try:
                    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                        config, sample_name, is_unique=is_unique)
                except FileNotFoundError:
                    msg = ("Could not parse out lengths and offsets for sample: {}. "
                        "Skipping".format(sample_name))
                    logger.error(msg)
                    continue
                
                orf_type_profile_base = filenames.get_orf_type_profile_base(
                    config['riboseq_data'], sample_name, length=lengths, offset=offsets, 
                    is_unique=is_unique, note=note, subfolder='orf-profiles')

                for orf_type in ribo_utils.orf_types:
                    for strand in strands:
                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base, orf_type, strand, 
                            image_type=args.image_type)


                        msg = "Looking for image file: {}".format(orf_type_profile)
                        logger.debug(msg)
                        if os.path.exists(orf_type_profile):
                            if i%4 == 0:
                                latex.begin_figure(out)

                            i += 1
                            latex.write_graphics(out, orf_type_profile, height=0.23)

                            if i%4 == 0:
                                latex.end_figure(out)
                                latex.clearpage(out)

                if (i>0) and (i%4 != 0):
                    latex.end_figure(out)
                    latex.clearpage(out)

        latex.end_document(out)

    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)

    if args.create_fastqc_reports:
        parallel.apply_parallel_iter(config['riboseq_samples'].items(), 
            args.num_cpus, 
            create_fastqc_reports, config, args)
def main():
    
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script runs the second part of the pipeline: it estimate ORF Bayes"
            "factors using the ORF profiles, then make the final prediction set.")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
        type=int, default=default_num_cpus)
    
    parser.add_argument('--do-not-call', action='store_true')
    parser.add_argument('--overwrite', help="If this flag is present, existing files "
        "will be overwritten.", action='store_true')

    parser.add_argument('--merge-replicates', help="If this flag is present, then the ORF "
        "profiles will be merged for all replicates in the condition given by <name>. The "
        "filenames, etc., will reflect the condition name, but not the lengths and offsets "
        "of the individual replicates.\n\nN.B. If this flag is is present, the --overwrite "
        "flag will automatically be set!", action='store_true')
        
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[predict_translated_orfs]: {}".format(' '.join(sys.argv))
    logger.debug(msg)

    logging_str = logging_utils.get_logging_options_string(args)

    config = yaml.load(open(args.config))
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs =  [   
        'estimate-orf-bayes-factors',
        'select-final-prediction-set'
    ]
    shell_utils.check_programs_exist(programs)

    
    required_keys = [   
        'riboseq_data',
        'fasta',
        'genome_base_path',
        'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    models_base = config.get('models_base', default_models_base)

    note_str = config.get('note', None)

    # we always need the ORFs
    orfs_genomic = filenames.get_orfs(
        config['genome_base_path'], 
        config['genome_name'], 
        note=config.get('orf_note')
    )

    # smoothing parameters (filenames)
    # default values are not used in the file names
    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations', None)

    # check if we are running Rp-Bp (default) or Rp-chi
    chi_square_only_str = ""
    chi_square_only = False
    if 'chi_square_only' in config:
        chi_square_only_str = "--chi-square-only"
        chi_square_only = True
        fraction = None
        reweighting_iterations = None
        msg = """ The final prediction set will be made based on the chi square test only! 
                  The translation models will not be fit to the data, and the posterior 
                  distributions will not be estimated. """
        logger.info(msg)

    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    # first, check if we are merging replicates

    # either way, the following variables need to have values for the rest of
    # the pipeline: lengths, offsets, smooth_profiles
    if args.merge_replicates:
        msg = ("The --merge-replicates option was given, so --overwrite is "
            "being set to True.")
        logger.warning(msg)
        args.overwrite = True

        # now, actually merge the replicates
        riboseq_replicates = ribo_utils.get_riboseq_replicates(config)

        # we will not use the lengths and offsets in the filenames
        lengths = None
        offsets = None

        # we will also merge all of unsmoothed profiles
        replicate_profiles = [
            get_profile(name, config, args) 
                for name in riboseq_replicates[args.name]
        ]

        replicate_profiles_str = ' '.join(replicate_profiles)

        profiles = filenames.get_riboseq_profiles(config['riboseq_data'], args.name, 
            length=lengths, offset=offsets, is_unique=is_unique, note=note_str)

        cmd = "merge-replicate-orf-profiles {} {} {}".format(replicate_profiles_str,
            profiles, logging_str)
        in_files = replicate_profiles
        out_files = [profiles]

        # todo: implement file checker for mtx files
        shell_utils.call_if_not_exists(
            cmd, 
            out_files, 
            in_files=in_files, 
            overwrite=args.overwrite, 
            call=call
        )


    else:
        # otherwise, just treat things as normal
        # get the lengths and offsets which meet the required criteria from 
        # the config file
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(config, 
            args.name, args.do_not_call, is_unique=is_unique)
        
        profiles = get_profile(args.name, config, args)
        
    # estimate the bayes factors
    bayes_factors = filenames.get_riboseq_bayes_factors(
        config['riboseq_data'], 
        args.name, 
        length=lengths, 
        offset=offsets, 
        is_unique=is_unique, 
        note=note_str, 
        fraction=fraction, 
        reweighting_iterations=reweighting_iterations
    )

    # the smoothing options
    min_length_str = utils.get_config_argument(config, 'min_orf_length', 'min-length')
    max_length_str = utils.get_config_argument(config, 'max_orf_length', 'max-length')
    min_profile_str = utils.get_config_argument(config, 'min_signal', 'min-profile')

    fraction_str = utils.get_config_argument(config, 'smoothing_fraction', 'fraction')
    reweighting_iterations_str = utils.get_config_argument(config, 
        'smoothing_reweighting_iterations', 'reweighting-iterations')
    
    # parse out all of the options from the config file, if they are present
    translated_models = filenames.get_models(models_base, 'translated')
    untranslated_models = filenames.get_models(models_base, 'untranslated')

    translated_models_str = ' '.join(translated_models)
    untranslated_models_str = ' '.join(untranslated_models)

    translated_models_str = "--translated-models {}".format(
        translated_models_str)
    untranslated_models_str = "--untranslated-models {}".format(
        untranslated_models_str)
    
    orf_types_str = utils.get_config_argument(config, 'orf_types')
    
    seed_str = utils.get_config_argument(config, 'seed')
    chains_str = utils.get_config_argument(config, 'chains', 'chains')
    iterations_str = utils.get_config_argument(config, 'translation_iterations', 'iterations')


    cmd = ("estimate-orf-bayes-factors {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} "
        "--num-cpus {}".format(
        profiles, 
        orfs_genomic, 
        bayes_factors, 
        translated_models_str, 
        untranslated_models_str, 
        logging_str, 
        orf_types_str, 
        min_length_str, 
        max_length_str, 
        min_profile_str, 
        fraction_str, 
        reweighting_iterations_str,
        seed_str, 
        iterations_str, 
        chains_str, 
        chi_square_only_str, 
        args.num_cpus)
    )
    
    in_files = [profiles, orfs_genomic]
    in_files.extend(translated_models)
    in_files.extend(untranslated_models)
    out_files = [bayes_factors]
    file_checkers = {
        bayes_factors: utils.check_gzip_file
    }
    msg = "estimate-bayes-factors in_files: {}".format(in_files)
    logger.debug(msg)
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files, 
        file_checkers=file_checkers, overwrite=args.overwrite, call=call)

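    # create both the unfiltered and the filtered prediction sets; the filtered set
    # keeps only the longest ORF at each stop codon and the best overlapping ORF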
    for is_filtered in [True, False]:
            
        filtered_str = ""
        if is_filtered:
            filtered_str = "--select-longest-by-stop --select-best-overlapping"

        # now, select the ORFs (longest for each stop codon) which pass the prediction filters
        predicted_orfs = filenames.get_riboseq_predicted_orfs(
            config['riboseq_data'], 
            args.name, 
            length=lengths, 
            offset=offsets, 
            is_unique=is_unique, 
            note=note_str, 
            fraction=fraction, 
            reweighting_iterations=reweighting_iterations,
            is_filtered=is_filtered, 
            is_chisq=chi_square_only
        )

        predicted_orfs_dna = filenames.get_riboseq_predicted_orfs_dna(
            config['riboseq_data'], 
            args.name, 
            length=lengths, 
            offset=offsets, 
            is_unique=is_unique, 
            note=note_str, 
            fraction=fraction, 
            reweighting_iterations=reweighting_iterations,
            is_filtered=is_filtered, 
            is_chisq=chi_square_only
        )

        predicted_orfs_protein = filenames.get_riboseq_predicted_orfs_protein(
            config['riboseq_data'], 
            args.name, 
            length=lengths, 
            offset=offsets, 
            is_unique=is_unique, 
            note=note_str,
            fraction=fraction, 
            reweighting_iterations=reweighting_iterations,
            is_filtered=is_filtered, 
            is_chisq=chi_square_only
        )

        min_bf_mean_str = utils.get_config_argument(config, 'min_bf_mean')
        max_bf_var_str = utils.get_config_argument(config, 'max_bf_var')
        min_bf_likelihood_str = utils.get_config_argument(config, 'min_bf_likelihood')
    
        chisq_significance_level_str = utils.get_config_argument(config, 'chisq_significance_level')
        min_profile_str = utils.get_config_argument(config, 'min_signal', 'minimum-profile-sum')

        cmd = "select-final-prediction-set {} {} {} {} {} {} {} {} {} {} {}".format(
            bayes_factors, 
            config['fasta'], 
            predicted_orfs, 
            predicted_orfs_dna, 
            predicted_orfs_protein,
            min_bf_mean_str, 
            max_bf_var_str, 
            min_bf_likelihood_str, 
            logging_str, 
            chi_square_only_str,
            filtered_str
        )

        in_files = [bayes_factors, config['fasta']]
        out_files = [
            predicted_orfs, 
            predicted_orfs_dna, 
            predicted_orfs_protein
        ]

        file_checkers = {
            predicted_orfs: utils.check_gzip_file
        }

        # todo: implement file checker for fasta files
        shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files, 
            file_checkers=file_checkers, overwrite=args.overwrite, call=call)
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates all of the files necessary for downstream "
        "analysis performed with the rpbp package.")
    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--overwrite', help="If this flag is present, existing files "
        "will be overwritten.", action='store_true')
    
    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)

    config = yaml.load(open(args.config))
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs =  [
                 'extract-orf-coordinates',
                 'label-orfs',
                 'bowtie2-build-s',
                 'split-bed12-blocks',
                 'gtf-to-bed12',
                 args.star_executable
                ]
    shell_utils.check_programs_exist(programs)

    
    required_keys = [   'genome_base_path',
                        'genome_name',
                        'gtf',
                        'fasta',
                        'ribosomal_fasta',
                        'ribosomal_index',
                        'star_index'
                    ]
    utils.check_keys_exist(config, required_keys)

    # check that the required files are present
    files = [
        config['gtf'],
        config['fasta'],
        config['ribosomal_fasta']
    ]

    if 'de_novo_gtf' in config:
        files += [config['de_novo_gtf']]

    utils.check_files_exist(files, source='prepare-rpbp-genome')

    # now, check if we want to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return
   
    # the rrna index
    cmd = "bowtie2-build-s {} {}".format(config['ribosomal_fasta'], 
        config['ribosomal_index'])

    in_files = [config['ribosomal_fasta']]
    out_files = bio.get_bowtie2_index_files(config['ribosomal_index'])
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files, 
        overwrite=args.overwrite, call=call)
    
    # the STAR index
    mem = utils.human2bytes(args.mem)
    cmd = ("{} --runMode genomeGenerate --genomeDir {} --genomeFastaFiles {} "
        "--runThreadN {} --limitGenomeGenerateRAM {}".format(args.star_executable, 
        config['star_index'], config['fasta'], args.num_cpus, mem))
        
    in_files = [config['fasta']]
    out_files = star_utils.get_star_index_files(config['star_index'])
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files, 
        overwrite=args.overwrite, call=call)

    # get the main orfs
    get_orfs(config['gtf'], args, config, is_annotated=True, is_de_novo=False)

    # eventually, we will use these names
    annotated_orfs = filenames.get_orfs(config['genome_base_path'], 
        config['genome_name'], note=config.get('orf_note'), is_annotated=True,
        is_de_novo=False)
   
    annotated_exons_file = filenames.get_exons(config['genome_base_path'], 
        config['genome_name'], note=config.get('orf_note'), 
        is_annotated=True, is_de_novo=False, is_orf=True)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'], 
        config['genome_name'], note=config.get('orf_note'))

    exons_file = filenames.get_exons(config['genome_base_path'], 
        config['genome_name'], note=config.get('orf_note'), is_orf=True)

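    # STAR is given a single annotation file whose format (GTF vs. GFF3) follows
    # the reference annotation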
    use_gff3_specs = config['gtf'].endswith('gff')
    gtf_file = filenames.get_gtf(config['genome_base_path'],
        config['genome_name'], is_gff3=use_gff3_specs, is_star_input=True)
   
    # now, check if we have a de novo assembly
    if 'de_novo_gtf' in config:
        get_orfs(config['de_novo_gtf'], args, config, is_annotated=False, 
            is_de_novo=True)

        # we need to concat the ORF and exon files
        de_novo_orfs = filenames.get_orfs(config['genome_base_path'], 
            config['genome_name'], note=config.get('orf_note'), is_annotated=False,
            is_de_novo=True)
       
        de_novo_exons_file = filenames.get_exons(config['genome_base_path'], 
            config['genome_name'], note=config.get('orf_note'), 
            is_annotated=False, is_de_novo=True, is_orf=True)


        orfs_files = [annotated_orfs, de_novo_orfs]

        orfs_files_str = ' '.join(orfs_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            orfs_genomic, orfs_files_str))
        logger.info(msg)

        if call:
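            # renumber the ORFs so orf_num is unique across the annotated and de novo sets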
            concatenated_bed = bed_utils.concatenate(orfs_files, sort_bed=True)
            concatenated_bed['orf_num'] = range(len(concatenated_bed))
            fields = bed_utils.bed12_field_names + ['orf_num', 'orf_len', 'orf_type']
            bed_utils.write_bed(concatenated_bed[fields], orfs_genomic)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

        
        exons_files = [annotated_exons_file, de_novo_exons_file]
        
        exons_files_str = ' '.join(exons_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            exons_file, exons_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(exons_files, sort_bed=True)
            fields = bed_utils.bed6_field_names + ['exon_index', 'transcript_start']
            bed_utils.write_bed(concatenated_bed[fields], exons_file)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

        # we also need to concat the annotations to inform STAR
        # there is no particular reason to merge and sort the files, so
        # we just concatenate them...
        if (config['de_novo_gtf'].endswith('gff') == use_gff3_specs):
            cmd = ("awk '!/^#/' {} {} > {}".format(config['gtf'], config['de_novo_gtf'], gtf_file))
            in_files = [config['gtf'], config['de_novo_gtf']]
            out_files = [gtf_file]
            shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                overwrite=args.overwrite, call=call)
        else:
            msg = ("Skipping concatenation due to mismatch in format specifications (GTF2/GFF3)"
                  "for reference and do novo annotations. Symlink to reference annotations created.")
            logger.warning(msg)
            if os.path.exists(config['gtf']):
                shell_utils.create_symlink(config['gtf'], gtf_file, call)

    else:
        # finally, make sure our files are named correctly
        
        if os.path.exists(annotated_orfs):
            shell_utils.create_symlink(annotated_orfs, orfs_genomic, call)

        if os.path.exists(annotated_exons_file):
            shell_utils.create_symlink(annotated_exons_file, exons_file, call)

        if os.path.exists(config['gtf']):
            shell_utils.create_symlink(config['gtf'], gtf_file, call)
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script identifies the orf peptide matches for all samples in "
        "a project.")
    parser.add_argument('config', help="The (yaml) config file")
    
    parser.add_argument('--peptide-filter-field', help="The field to use for "
        "filtering the peptides from MaxQuant", default=default_peptide_filter_field)

    parser.add_argument('--peptide-filter-value', help="All peptides with a value "
        "greater than the filter value will be removed", type=float, 
        default=default_peptide_filter_value)

    parser.add_argument('--peptide-separator', help="The separator in the "
        "peptide file", default=default_peptide_separator)

    parser.add_argument('--note', help="If this option is given, it will be used in "
        "the output filenames.\n\nN.B. This REPLACES the note in the config file.", 
        default=default_note)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)
    
    logging_str = logging_utils.get_logging_options_string(args)

    config = yaml.load(open(args.config))
    call = not args.do_not_call

    programs = [
        'get-orf-peptide-matches'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'peptide_files',
        'peptide_cell_type_analysis',
        'riboseq_data',
        'riboseq_samples'
    ]
    utils.check_keys_exist(config, required_keys)

    note_str = config.get('note', None)
    out_note_str = note_str

    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note

    args_dict = vars(args)

    peptide_filter_field_str = utils.get_config_argument(args_dict, 'peptide_filter_field')
    peptide_filter_value_str = utils.get_config_argument(args_dict, 'peptide_filter_value')
    peptide_separator_str = utils.get_config_argument(args_dict, 'peptide_separator')

    num_cpus_str = utils.get_config_argument(args_dict, 'num_cpus')

    cell_types = ribo_utils.get_riboseq_cell_type_samples(config)
    for cell_type, peptide_files in config['peptide_cell_type_analysis'].items():
        if cell_type not in cell_types:
            msg = ("Could not find cell_type specification. Please check the config "
                "file: {}".format(cell_type))
            logger.warning(msg)
            continue
            
        cell_type_protein = ribo_filenames.get_riboseq_cell_type_protein(
            config['riboseq_data'], cell_type, is_filtered=True, note=note_str)

        if not os.path.exists(cell_type_protein):
            msg = ("Could not find cell_type protein fasta. Skipping: {}".
                format(cell_type_protein))
            logger.warning(msg)
            continue
            
        for peptide_file in peptide_files:
            if peptide_file not in config['peptide_files']:
                msg = ("Could not find peptide_file specification. Please check "
                    "the config file: {}".format(peptide_file))
                logger.warning(msg)
                continue
                
            peptide_txt_file = config['peptide_files'][peptide_file]
                
            if not os.path.exists(peptide_txt_file):
                msg = ("Could not find peptide.txt file. Skipping: {}".
                    format(peptide_txt_file))
                logger.warning(msg)
                continue
                
            peptide_matches = ribo_filenames.get_riboseq_peptide_matches(
                config['riboseq_data'], cell_type, peptide_file, 
                is_filtered=True, note=out_note_str)
                
            cmd = "get-orf-peptide-matches {} {} {} {} {} {} {} {}".format(cell_type_protein, 
                peptide_txt_file, peptide_matches, num_cpus_str, peptide_filter_field_str, 
                peptide_filter_value_str, peptide_separator_str, logging_str)

            slurm.check_sbatch(cmd, args=args)
def main():
    global profiles_data, profiles_indices, profiles_indptr, profiles_shape
    global translated_models, untranslated_models
    global args

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""
            This script uses Hamiltonian MCMC with Stan to estimate translation parameters
            for a set of regions (presumably ORFs). Roughly, it takes as input:

            (1) a set of regions (ORFs) and their corresponding profiles
            (2) a "translated" model which gives the probability that a region is translated
            (3) an "untranslated" model which gives the probability that a region is not translated

            The script first smoothes the profiles using LOWESS. It then calculates
            both the Bayes' factor (using the smoothed profile) and \chi^2 value
            (using the raw counts) for each ORF.
        """
        )

    parser.add_argument('profiles', help="The ORF profiles (counts) (mtx)")
    parser.add_argument('regions', help="The regions (ORFs) for which predictions will "
        "be made (BED12+)")
    
    parser.add_argument('out', help="The output file for the Bayes' factors (BED12+)")

    parser.add_argument('--chi-square-only', help="If this flag is present, then only the chi "
        "square test will be performed for each ORF. This can also be a way to get the counts "
        "within each of the ORFs.", action='store_true')
    
    parser.add_argument('--translated-models', help="The models to use as H_t (pkl)", nargs='+')
    parser.add_argument('--untranslated-models', help="The models to use as H_u (pkl)", nargs='+')

    ### filtering options
    parser.add_argument('--orf-types', help="If values are given, then only orfs with "
        "those types are processed.", nargs='*', default=default_orf_types)
    parser.add_argument('--orf-type-field', default=default_orf_type_field)

    parser.add_argument('--min-length', help="ORFs with length less than this value will not "
        "be processed", type=int, default=default_min_length)
    parser.add_argument('--max-length', help="ORFs with length greater than this value will not "
        "be processed", type=int, default=default_max_length)
    parser.add_argument('--min-profile', help="ORFs with profile sum (i.e., number "
        "of reads) less than this value will not be processed.", type=float, 
        default=default_min_profile)

    ### smoothing options
    parser.add_argument('--fraction', help="The fraction of signal to use in LOWESS", 
        type=float, default=default_fraction)

    parser.add_argument('--reweighting-iterations', help="The number of reweighting "
        "iterations to use in LOWESS. Please see the statsmodels documentation for a "
        "detailed description of this parameter.", type=int, default=default_reweighting_iterations)

    ### MCMC options
    parser.add_argument('-s', '--seed', help="The random seeds to use for inference",
        type=int, default=default_seed)
    parser.add_argument('-c', '--chains', help="The number of MCMC chains to use", type=int,
        default=default_chains)
    parser.add_argument('-i', '--iterations', help="The number of MCMC iterations to use for "
        "each chain", type=int, default=default_iterations)
    
    ### behavior options
    parser.add_argument('--num-orfs', help="If n>0, then only this many ORFs will be processed",
        type=int, default=default_num_orfs)
    parser.add_argument('--orf-num-field', default=default_orf_num_field)

    parser.add_argument('--do-not-compress', help="By default, the output will be "
        "written in gzip format. If this flag is given, the output will not be "
        "compressed.", action='store_true')

    parser.add_argument('-g', '--num-groups', help="The number of groups into which to split "
        "the ORFs. More groups means the progress bar is updated more frequently but incurs "
        "more overhead because of the parallel calls.", type=int, default=default_num_groups)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # read in the regions and apply the filters
    msg = "Reading and filtering ORFs"
    logger.info(msg)
    regions = bed_utils.read_bed(args.regions)

    # by default, keep everything
    m_filters = np.array([True] * len(regions))

    if len(args.orf_types) > 0:
        m_orf_type = regions[args.orf_type_field].isin(args.orf_types)
        m_filters = m_orf_type & m_filters

    # min length
    if args.min_length > 0:
        m_min_length = regions['orf_len'] >= args.min_length
        m_filters = m_min_length & m_filters

    # max length
    if args.max_length > 0:
        m_max_length = regions['orf_len'] <= args.max_length
        m_filters = m_max_length & m_filters

    # min profile
    profiles = scipy.io.mmread(args.profiles).tocsr()
    profiles_sums = profiles.sum(axis=1)
    good_orf_nums = np.where(profiles_sums >= args.min_profile)
    good_orf_nums = set(good_orf_nums[0])
    m_profile = regions['orf_num'].isin(good_orf_nums)
    m_filters = m_profile & m_filters

    regions = regions[m_filters]
    
    if args.num_orfs > 0:
        regions = regions.head(args.num_orfs)

    regions = regions.reset_index(drop=True)

    msg = "Number of regions after filtering: {}".format(len(regions))
    logger.info(msg)

    logger.debug("Reading models")
    translated_models = [pickle.load(open(tm, 'rb')) for tm in args.translated_models]
    untranslated_models = [pickle.load(open(bm, 'rb')) for bm in args.untranslated_models]
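    # the CSR components (data, indices, indptr) and the shape are copied below into
    # shared-memory RawArrays; the parallel workers presumably rebuild the profile
    # matrix from these module-level globals instead of pickling it for every task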
    
    profiles_data = multiprocessing.RawArray(ctypes.c_double, profiles.data.flat)
    profiles_indices = multiprocessing.RawArray(ctypes.c_int, profiles.indices)
    profiles_indptr = multiprocessing.RawArray(ctypes.c_int, profiles.indptr)
    profiles_shape = multiprocessing.RawArray(ctypes.c_int, profiles.shape)
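    # suppress_stdout_stderr presumably silences the verbose compiler/sampler output
    # from Stan while the Bayes factors are estimated in parallel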
    
    with suppress_stdout_stderr():
        
        bfs_l = parallel.apply_parallel_split(
            regions, 
            args.num_cpus,
            get_all_bayes_factors_args, 
            num_groups=args.num_groups,
            progress_bar=True
        )

    bfs = pd.concat(bfs_l)

    # write the results as a bed12+ file
    bed_utils.write_bed(bfs, args.out)
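# A minimal sketch (an assumption, not the original helper) of how a worker such as
# get_all_bayes_factors_args could rebuild the read-only profile matrix from the
# shared RawArrays set up in main above, without copying the matrix for each task.
def _rebuild_shared_profiles_sketch():
    import ctypes
    import numpy as np
    import scipy.sparse

    data = np.frombuffer(profiles_data, dtype=ctypes.c_double)
    indices = np.frombuffer(profiles_indices, dtype=ctypes.c_int)
    indptr = np.frombuffer(profiles_indptr, dtype=ctypes.c_int)
    shape = tuple(profiles_shape)

    return scipy.sparse.csr_matrix((data, indices, indptr), shape=shape)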
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Test models learned with train-as-auto-sklearn. It "
        "writes the predictions to disk as a \"long\" data frame. The output "
        "file is in gzipped csv format.")
    
    parser.add_argument('scenario', help="The ASlib scenario")
    
    parser.add_argument('model_template', help="A template string for the filenames for "
        "the learned models. ${solver} and ${fold} are the template parts of "
        "the string. It is probably necessary to surround this argument with "
        "single quotes in order to prevent shell replacement of the template "
        "parts.")

    parser.add_argument('out', help="The output csv file")

    parser.add_argument('--config', help="A (yaml) config file which "
        "specifies options controlling the learner behavior")
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ASlib scenario"
    logger.info(msg)

    scenario = ASlibScenario()
    scenario.read_scenario(args.scenario)

    if args.config is not None:
        msg = "Loading yaml config file"
        logger.info(msg)
        config = yaml.load(open(args.config))
    else:
        config = {}

    msg = "Creating string templates"
    logger.info(msg)
    model_template = string.Template(args.model_template)
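    # e.g., a (hypothetical) template such as 'models/${solver}.fold-${fold}.pkl'
    # yields 'models/lingeling.fold-3.pkl' after substitute(solver='lingeling', fold=3)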

    msg = "Finding folds from ASlib scenario"
    logger.info(msg)
        
    folds = [int(i) for i in scenario.cv_data['fold'].unique()]
    folds = sorted(folds)

    msg = "Making predictions"
    logger.info(msg)

    all_predictions = []
    it = itertools.product(scenario.algorithms, folds)
    for solver, fold in it:
        
        model_file = model_template.substitute(solver=solver, fold=fold)
        
        if not os.path.exists(model_file):
            msg = "Could not find model file. Skipping: {}".format(model_file)
            logger.warning(msg)
            continue
        
        try:
            model = joblib.load(model_file)
        except Exception:
            msg = ("Problem loading the model file. Skipping: {}".format(
                model_file))
            logger.warning(msg)
            continue
            
        msg = "Processing. solver: {}. fold: {}".format(solver, fold)
        logger.info(msg)
        
        testing, training = scenario.get_split(fold)
        y_pred = model.predict(testing.feature_data)

        if 'log_performance_data' in config:
            # the models were presumably trained on log1p-transformed performance
            # data, so invert that transform to recover the raw predictions
            y_pred = np.expm1(y_pred)
            
        pred_df = pd.DataFrame()
        pred_df['instance_id'] = testing.feature_data.index
        pred_df['solver'] = solver
        pred_df['fold'] = fold
        pred_df['actual'] = testing.performance_data[solver].values
        pred_df['predicted'] = y_pred
        
        all_predictions.append(pred_df)
        
    msg = "Joining all predictions in a long data frame"
    logger.info(msg)
    all_predictions = pd.concat(all_predictions)

    msg = "Writing predictions to disk"
    logger.info(msg)

    utils.write_df(all_predictions, args.out, index=False)
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script uses the mygene.info service to find annotations "
        "for the transcripts associated with the ORFs in the given bed file. In "
        "particular, it extracts information from Swiss-Prot, TrEMBL, Interpro, "
        "PDB, Pfam, PROSITE, the Gene Ontology, and KEGG.")

    parser.add_argument('bed', help="The bed file")
    parser.add_argument('out', help="The output file. Its type will be inferred "
        "from its extension.")

    parser.add_argument('--do-not-trim', help="By default, the script will "
        "attempt to trim transcript identifiers such that they are valid Ensembl "
        "identifiers. If this flag is given, no trimming will take place.",
        action='store_true')

    parser.add_argument('--scopes', help="A list of scopes to use when querying "
        "mygene.info. Please see the documentation for more information about "
        "valid scopes: http://mygene.info/doc/query_service.html#available_fields",
        nargs='*', default=default_scopes)

    parser.add_argument('--do-not-convert-ids', help="By default, the script will "
        "treat the identifiers in the file as transcript identifiers. It first "
        "maps those to gene identifiers, and then it uses those to find the "
        "gene annotations. If the identifiers are already gene ids (or whatever "
        "is specified by scopes), then the first mapping is not necessary and "
        "can be skipped using this flag.", action='store_true')
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    convert_ids = not args.do_not_convert_ids

    msg = "Reading the bed file"
    logger.info(msg)
    bed = bed_utils.read_bed(args.bed)
    bed = bed[fields_to_keep]

    msg = "Extracting transcript ids"
    logger.info(msg)
    trim = not args.do_not_trim
    orf_ids = parallel.apply_iter_simple(bed['id'], parse_orf_id, trim)
    orf_ids_df = pd.DataFrame(orf_ids)

    if convert_ids:
        msg = "Querying transcript to gene id mapping"
        logger.info(msg)
        gene_ids = mygene_utils.get_transcript_to_gene_mapping(orf_ids_df['transcript_id'])
    else:
        gene_ids = pd.DataFrame()
        gene_ids['transcript_id'] = orf_ids_df['transcript_id']
        gene_ids['gene_id'] = orf_ids_df['transcript_id']

    msg = "Querying gene annotations"
    logger.info(msg)
    res_df = mygene_utils.query_mygene(gene_ids['gene_id'])

    msg = "Combining gene annotations with transcript ids"
    logger.info(msg)
    res_df = gene_ids.merge(res_df, on='gene_id', how='inner')

    msg = "Combining transcript annotations with ORF ids"
    logger.info(msg)
    orf_ids_fields = ['transcript_id', 'orf_id']
    res_df = orf_ids_df[orf_ids_fields].merge(res_df, on='transcript_id', how='inner')

    msg = "Combining ORF annotations with ORF predictions"
    logger.info(msg)
    res_df = bed.merge(res_df, left_on='id', right_on='orf_id', how='left')

    msg = "Writing ORF annotations to disk"
    logger.info(msg)
    pandas_utils.write_df(res_df, args.out, index=False)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Convert a bed12 file into an equivalent gtf file. In "
        "particular, it uses the \"thick_start\" and \"thick_end\" fields to "
        "determine the CDS gtf entries. It only creates \"exon\" and \"CDS\" "
        "gtf entries. The bed columns after the standard 12 are included as "
        "attributes in the gtf file.")

    parser.add_argument('bed',
                        help="The bed12 file. It must conform to the "
                        "style expected by lifesci.bed_utils.")
    parser.add_argument('out',
                        help="The (output) gtf file. It will conform "
                        "to the style dictated by lifesci.gtf_utils.")

    parser.add_argument('-s',
                        '--source',
                        help="The name to use for the "
                        "\"source\" column in the gtf file",
                        default=default_source)

    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of CPUs to use "
                        "for conversion",
                        type=int,
                        default=default_num_cpus)

    parser.add_argument(
        '--add-gene-id',
        help="If this flag is present, then "
        "the \"id\" field will also be used as the \"gene_id\"",
        action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading bed file"
    logger.info(msg)
    bed = bed_utils.read_bed(args.bed)

    if args.add_gene_id:
        msg = "Adding gene_id"
        logger.info(msg)
        bed['gene_id'] = bed['id']

    msg = "Expanding bed entries to gtf entries"
    logger.info(msg)

    gtf_entries = parallel.apply_parallel(bed,
                                          args.num_cpus,
                                          gtf_utils.get_gtf_entries,
                                          args.source,
                                          progress_bar=True)

    msg = "Joining gtf entries into large data frame"
    logger.info(msg)

    gtf_entries = pd.concat(gtf_entries)

    msg = "Sorting gtf entries"
    logger.info(msg)

    gtf_entries = gtf_entries.sort_values(['seqname', 'start', 'end'])
    gtf_entries = gtf_entries.reset_index(drop=True)

    msg = "Writing gtf to disk"
    logger.info(msg)
    gtf_utils.write_gtf(gtf_entries, args.out, compress=False)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Collect the individual read length ORF profiles created "
        "by create-read-length-orf-profiles into a single sparse \"tensor\". "
        "N.B. This script is called by create-read-length-orf-profiles, and "
        "it is unlikely that it should ever be called independently.")

    parser.add_argument('config', help="The (json) config file")
    parser.add_argument('name',
                        help="The name for the dataset, used in the "
                        "created files")

    parser.add_argument('out',
                        help="The (mtx.gz) output file containing the "
                        "ORF profiles and read lengths")

    parser.add_argument(
        '-c',
        '--is-condition',
        help="If this flag is present, "
        "then \"name\" will be taken to be a condition name. The profiles for "
        "all relevant replicates of the condition will be created.",
        action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading config file"
    logger.info(msg)
    config = yaml.load(open(args.config))

    # pull out what we need from the config file
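    # note: only the *presence* of keep_riboseq_multimappers is checked here, so
    # setting it to false in the config would still keep the multimappers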
    is_unique = not ('keep_riboseq_multimappers' in config)
    seqname_prefix_str = utils.get_config_argument(config, 'seqname_prefix')
    note = config.get('note', None)

    names = [args.name]
    if args.is_condition:
        riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
        names = [n for n in riboseq_replicates[args.name]]

    # keep a map from the lengths to the combined profiles
    length_profile_map = {}

    for name in names:

        msg = "Processing sample: {}".format(name)
        logger.info(msg)

        # get the lengths and offsets which meet the required criteria from the config file
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config, name, is_unique=is_unique)

        if len(lengths) == 0:
            msg = (
                "No periodic read lengths and offsets were found. Try relaxing "
                "min_metagene_profile_count, min_metagene_bf_mean, "
                "max_metagene_bf_var, and/or min_metagene_bf_likelihood. Quitting."
            )
            logger.critical(msg)
            return

        for length, offset in zip(lengths, offsets):

            mtx = filenames.get_riboseq_profiles(config['riboseq_data'],
                                                 name,
                                                 length=[length],
                                                 offset=[offset],
                                                 is_unique=is_unique,
                                                 note=note)

            mtx = scipy.io.mmread(mtx).tocsr()

            prior_mtx = length_profile_map.get(length, None)

            if prior_mtx is None:
                length_profile_map[length] = mtx
            else:
                length_profile_map[length] = prior_mtx + mtx

    with gzip.open(args.out, 'wb') as target_gz:

        for length, mtx in length_profile_map.items():
            mtx = mtx.tocoo()

            msg = "Writing ORF profiles. length: {}.".format(length)
            logger.info(msg)

            for row, col, val in zip(mtx.row, mtx.col, mtx.data):
                s = "{} {} {} {}\n".format(length, row, col, val)
                target_gz.write(s.encode())
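# A minimal sketch (an assumption, not part of the original pipeline) of how the
# gzipped "length row col val" records written above could be read back into a
# dictionary mapping read length to a sparse matrix. The matrix shape is passed in
# explicitly because the file itself does not record it.
def _read_length_profiles_sketch(path, shape):
    import gzip
    from collections import defaultdict
    import scipy.sparse

    entries = defaultdict(lambda: ([], [], []))  # length -> (rows, cols, vals)
    with gzip.open(path, 'rt') as in_gz:
        for line in in_gz:
            length, row, col, val = line.split()
            rows, cols, vals = entries[int(length)]
            rows.append(int(row))
            cols.append(int(col))
            vals.append(float(val))

    return {
        length: scipy.sparse.coo_matrix((vals, (rows, cols)), shape=shape)
        for length, (rows, cols, vals) in entries.items()
    }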
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Extract the ORF profiles for each specified read length "
        "and offset independently. One sparse matrix file will be created for "
        "each read length. It then collects the values into a sparse tensor.")

    parser.add_argument('config', help="The (json) config file")
    parser.add_argument('name', help="The name for the dataset, used in the "
        "created files")
    
    parser.add_argument('out', help="The (mtx.gz) output file containing the "
        "ORF profiles and read lengths")

    parser.add_argument('-c', '--is-condition', help="If this flag is present, "
        "then \"name\" will be taken to be a condition name. The profiles for "
        "all relevant replicates of the condition will be created.", 
        action='store_true')

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    cpus_str = "--num-cpus {}".format(args.num_cpus)

    msg = "[create-read-length-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    msg = "Reading config file"
    logger.info(msg)
    config = yaml.load(open(args.config))
 
    # pull out what we need from the config file
    is_unique = not ('keep_riboseq_multimappers' in config)    
    seqname_str = utils.get_config_argument(config, 'seqname_prefix')
    note = config.get('note', None)
    orf_note = config.get('orf_note', None)

    
    orfs = filenames.get_orfs(
        config['genome_base_path'], 
        config['genome_name'], 
        note=orf_note
    )

    exons = filenames.get_exons(
        config['genome_base_path'], 
        config['genome_name'],
        note=orf_note,
        is_orf=True
    )
    
    # make sure the necessary files exist
    required_files = [orfs, exons]
    msg = "[create-read-length-orf-profiles]: Some input files were missing: "
    utils.check_files_exist(required_files, msg=msg)

    # check which samples to process
    names = [args.name]
    if args.is_condition:
        riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
        names = [n for n in riboseq_replicates[args.name]]

    job_ids = []
    for name in names:

        msg = "Processing sample: {}".format(name)
        logger.info(msg)
        
        # now the relevant files
        bam = filenames.get_riboseq_bam(
            config['riboseq_data'], 
            name, 
            is_unique=is_unique, 
            note=note
        )

        # make sure the necessary files exist
        required_files = [bam]
        msg = "[create-read-length-orf-profiles]: Some input files were missing: "
        utils.check_files_exist(required_files, msg=msg)

        # get the lengths and offsets which meet the required criteria from the config file
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config, 
            name, 
            is_unique=is_unique
        )

        if len(lengths) == 0:
            msg = ("No periodic read lengths and offsets were found. Try relaxing "
                "min_metagene_profile_count, min_metagene_bf_mean, "
                "max_metagene_bf_var, and/or min_metagene_bf_likelihood. Quitting.")
            logger.critical(msg)
            return


        for length, offset in zip(lengths, offsets):
            lengths_str = "--lengths {}".format(length)
            offsets_str = "--offsets {}".format(offset)

            
            mtx = filenames.get_riboseq_profiles(
                config['riboseq_data'], 
                name, 
                length=[length], 
                offset=[offset],
                is_unique=is_unique, 
                note=note
            )

            cmd = "extract-orf-profiles {} {} {} {} {} {} {} {} {}".format(
                bam,
                orfs,
                exons,
                mtx,
                lengths_str,
                offsets_str,
                seqname_str,
                cpus_str,
                logging_str
            )
            
            job_id = slurm.check_sbatch(cmd, args=args)

            job_ids.append(job_id)

    # now, collect them into a single file
    offsets_str = ' '.join(str(o) for o in offsets)
    lengths_str = ' '.join(str(l) for l in lengths)

    offsets_str = "--offsets {}".format(offsets_str)
    lengths_str = "--lengths {}".format(lengths_str)

    is_condition_str = ""
    if args.is_condition:
        is_condition_str = "--is-condition"

    cmd = "collect-read-length-orf-profiles {} {} {} {} {}".format(
        args.config,
        args.name,
        args.out,
        is_condition_str,
        logging_str
    )
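    # presumably submitted with a dependency on the per-length extraction jobs above,
    # so the collection step only runs once all of the profiles have been written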

    slurm.check_sbatch(cmd, args=args, dependencies=job_ids)
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script removes all of the peptides which match to multiple "
        "ORFs from the results found with get-all-orf-peptide-matches.")

    parser.add_argument('peptide_matches', help="The peptide matches file produced "
        "by get-all-orf-peptide-matches")
    parser.add_argument('out', help="A similar peptide matches file which "
        "contains only peptides which match to a unique ORF")

    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use",
        type=int, default=default_num_cpus)
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading peptide matches"
    logger.info(msg)

    peptide_matches = pd.read_csv(args.peptide_matches)

    msg = "Splitting the grouped matches into individual peptide matches"
    logger.info(msg)

    matches = parallel.apply_parallel(  peptide_matches, 
                                        args.num_cpus, 
                                        parse_matches, 
                                        progress_bar=True)

    msg = "Removing peptides which match to multiple ORFs"
    logger.info(msg)

    matches = utils.remove_nones(matches)
    matches = utils.flatten_lists(matches)
    matches_df = pd.DataFrame(matches)
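    # keep=False drops every occurrence of a duplicated peptide, so only peptides
    # that match exactly one ORF remain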
    unique_matches_df = matches_df.drop_duplicates(subset='peptide', keep=False)

    msg = "Merging the ORF-peptide matches back to single records"
    logger.info(msg)

    unique_groups = unique_matches_df.groupby('orf_id')
    merged_unique_groups = parallel.apply_parallel_groups(  unique_groups, 
                                                            args.num_cpus, 
                                                            merge_group, 
                                                            progress_bar=True)

    merged_unique_df = pd.DataFrame(merged_unique_groups)

    msg = "Re-adding the ORFs which no longer have peptide matches"
    logger.info(msg)

    m_still_has_match = peptide_matches['orf_id'].isin(merged_unique_df['orf_id'])
    peptide_matches.loc[~m_still_has_match, 'num_matches'] = 0
    peptide_matches.loc[~m_still_has_match, 'peptide_matches'] = 0

    peps = [merged_unique_df, peptide_matches[~m_still_has_match]]
    merged_unique_df = pd.concat(peps)

    msg = "Writing the ORFs with unique matches to disk"
    logger.info(msg)

    utils.write_df(merged_unique_df, args.out, index=False)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Validate the algorithm selection performance of the "
        "predictions made using test-as-auto-sklearn using "
        "autofolio.validation.validate.Validator.")

    parser.add_argument('scenario', help="The ASlib scenario")
    parser.add_argument('predictions',
                        help="The predictions file, from "
                        "test-as-auto-sklearn")

    parser.add_argument('--config',
                        help="A (yaml) config file which "
                        "specifies options controlling the learner behavior")

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ASlib scenario"
    logger.info(msg)

    scenario = ASlibScenario()
    scenario.read_scenario(args.scenario)

    if args.config is not None:
        msg = "Loading yaml config file"
        logger.info(msg)
        config = yaml.load(open(args.config))
    else:
        config = {}
        config['allowed_feature_groups'] = [scenario.feature_group_dict.keys()]

    # either way, update the scenario with the features used during training
    scenario.used_feature_groups = config['allowed_feature_groups']

    msg = "Reading predictions"
    logger.info(msg)
    predictions = pd.read_csv(args.predictions)

    msg = "Selecting the algorithm with smallest prediction for each instance"
    logger.info(msg)

    algorithm_selections = pandas_utils.get_group_extreme(
        predictions, "predicted", ex_type="min", group_fields="instance_id")

    msg = "Creating the schedules for the validator"
    logger.info(msg)

    schedules = parallel.apply_df_simple(algorithm_selections, _get_schedule,
                                         scenario.algorithm_cutoff_time)

    schedules = utils.merge_dicts(*schedules)

    val = Validator()
    performance_type = scenario.performance_type[0]

    if performance_type == "runtime":
        stats = val.validate_runtime(schedules=schedules,
                                     test_scenario=scenario)

    elif performance_type == "solution_quality":
        stats = val.validate_quality(schedules=schedules,
                                     test_scenario=scenario)

    else:
        msg = "Unknown performance type: {}".format(performance_type)
        raise ValueError(msg)

    msg = "=== RESULTS ==="
    logger.info(msg)
    stats.show()
def main():
    
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script runs all of the processing necessary to produce the "
        "signals used for later processing. In particular, it runs the standard "
        "rnaseq and riboseq preprocessing, estimates the abundance of transcripts with "
        "the rnaseq data, and selects the most-expressed isoforms and ORFs. Next, it removes "
        "multimapping and non-periodic-length riboseq reads. Finally, it extracts the riboseq "
        "signal for the most-expressed transcripts.")
    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    parser.add_argument('config', help="The (json) config file")
    parser.add_argument('name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
        type=int, default=default_num_cpus)

    parser.add_argument('--mem', help="The amount of RAM to request", 
        default=default_mem)

    parser.add_argument('--flexbar-options', help="A space-delimited list of options to "
        "pass to flexbar. Each option must be quoted separately as in \"--flexbarOption value\". "
        "If specified, flexbar options will override default settings.", nargs='*', type=str)

    parser.add_argument('--tmp', help="The location for temp files", default=default_tmp)

    parser.add_argument('--do-not-call', action='store_true')
    parser.add_argument('--overwrite', help="If this flag is present, existing files "
        "will be overwritten.", action='store_true')
         
    parser.add_argument('-k', '--keep-intermediate-files', help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.", action='store_true')
            
    star_utils.add_star_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[create-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    config = yaml.load(open(args.config))
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs =  [   'flexbar',
                    args.star_executable,
                    'samtools',
                    'bowtie2',
                    'create-base-genome-profile',
                    'remove-multimapping-reads',
                    'extract-metagene-profiles',
                    'estimate-metagene-profile-bayes-factors',
                    'select-periodic-offsets',
                    'extract-orf-profiles'
                ]
    shell_utils.check_programs_exist(programs)

    
    required_keys = [   'riboseq_data',
                        'ribosomal_index',
                        'gtf',
                        'genome_base_path',
                        'genome_name'
                    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)

    star_index = filenames.get_star_index(config['genome_base_path'], 
        config['genome_name'], is_merged=False)

    models_base = config.get('models_base', default_models_base)

    # the first step is the standard riboseq preprocessing
    
    # handle do_not_call so that we _do_ call the preprocessing script, 
    # but that it does not run anything
    do_not_call_argument = ""
    if not call:
        do_not_call_argument = "--do-not-call"

    overwrite_argument = ""
    if args.overwrite:
        overwrite_argument = "--overwrite"

    orfs_genomic = filenames.get_orfs(config['genome_base_path'], config['genome_name'], 
        note=config.get('orf_note'))

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    flexbar_option_str = ""
    if args.flexbar_options is not None:
        flexbar_option_str = "--flexbar-options {}".format(' '.join('"' + flx_op + '"'
            for flx_op in args.flexbar_options))

    # check if we want to keep multimappers
    is_unique = not ('keep_riboseq_multimappers' in config)

    riboseq_raw_data = args.raw_data
    riboseq_bam_filename = filenames.get_riboseq_bam(config['riboseq_data'], args.name, 
        is_unique=is_unique, note=note)

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    cmd = ("create-base-genome-profile {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}"
        .format(riboseq_raw_data, args.config, args.name, args.num_cpus, 
        do_not_call_argument, overwrite_argument, logging_str, star_str, tmp_str,
        flexbar_option_str, keep_intermediate_str, mem_str))

    # There could be cases where we start somewhere in the middle of creating
    # the base genome profile. So even if the "raw data" is not available, 
    # we still want to call the base pipeline.
    #in_files = [riboseq_raw_data]
    in_files = []
    out_files = [riboseq_bam_filename]
    # we always call this, and pass --do-not-call through
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files, 
        overwrite=args.overwrite, call=True) 

    # create the metagene profiles
    metagene_profiles = filenames.get_metagene_profiles(config['riboseq_data'], 
        args.name, is_unique=is_unique, note=note)

    start_upstream_str = utils.get_config_argument(config, 
        'metagene_profile_start_upstream', 'start-upstream')
    start_downstream_str = utils.get_config_argument(config, 
        'metagene_profile_start_downstream', 'start-downstream')
    end_upstream_str = utils.get_config_argument(config, 
        'metagene_profile_end_upstream', 'end-upstream')
    end_downstream_str = utils.get_config_argument(config, 
        'metagene_profile_end_downstream', 'end-downstream')

    # use the canonical transcripts for extracting the metagene profiles
    transcript_bed = filenames.get_bed(config['genome_base_path'], 
        config['genome_name'], is_merged=False, is_annotated=True)

    cmd = ("extract-metagene-profiles {} {} {} --num-cpus {} {} {} {} {} {}"
        .format(riboseq_bam_filename, transcript_bed, metagene_profiles, 
        args.num_cpus, logging_str, start_upstream_str,
        start_downstream_str, end_upstream_str, end_downstream_str))

    in_files = [riboseq_bam_filename, orfs_genomic]
    out_files = [metagene_profiles]
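    # file_checkers maps an output file to a validation function; presumably,
    # call_if_not_exists re-runs the command when the file exists but fails the
    # check (e.g., a truncated gzip file)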
    file_checkers = {
        metagene_profiles: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files, 
        file_checkers=file_checkers, overwrite=args.overwrite, call=call)

    # estimate the periodicity for each offset for all read lengths
    metagene_profile_bayes_factors = filenames.get_metagene_profiles_bayes_factors(
        config['riboseq_data'], args.name, is_unique=is_unique, note=note)

    #periodic_models_str = utils.get_config_argument(config, 'periodic_models')
    #non_periodic_models_str = utils.get_config_argument(config, 'nonperiodic_models')
    periodic_models = filenames.get_models(models_base, 'periodic')
    non_periodic_models = filenames.get_models(models_base, 'nonperiodic')
    
    periodic_models_str = ' '.join(periodic_models)
    non_periodic_models_str = ' '.join(non_periodic_models)

    periodic_models_str = "--periodic-models {}".format(periodic_models_str)
    non_periodic_models_str = "--nonperiodic-models {}".format(non_periodic_models_str)
    
    periodic_offset_start_str = utils.get_config_argument(config, 'periodic_offset_start')
    periodic_offset_end_str = utils.get_config_argument(config, 'periodic_offset_end')
    metagene_profile_length_str = utils.get_config_argument(config, 'metagene_profile_length')
    seed_str = utils.get_config_argument(config, 'seed')
    chains_str = utils.get_config_argument(config, 'chains')
    iterations_str = utils.get_config_argument(config, 'metagene_profile_iterations', 'iterations')

    cmd = ("estimate-metagene-profile-bayes-factors {} {} --num-cpus {} {} {} "
        "{} {} {} {} {} {} {}".format(metagene_profiles, 
        metagene_profile_bayes_factors, args.num_cpus, 
        periodic_models_str, non_periodic_models_str,
        periodic_offset_start_str, periodic_offset_end_str, metagene_profile_length_str,
        seed_str, chains_str, iterations_str, logging_str))

    in_files = [metagene_profiles]
    in_files.extend(periodic_models)
    in_files.extend(non_periodic_models)
    out_files = [metagene_profile_bayes_factors]
    file_checkers = {
        metagene_profile_bayes_factors: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files, 
        file_checkers=file_checkers, overwrite=args.overwrite, call=call)
    
    # select the best read lengths for constructing the signal
    periodic_offsets = filenames.get_periodic_offsets(config['riboseq_data'], 
        args.name, is_unique=is_unique, note=note)

    cmd = "select-periodic-offsets {} {}".format(metagene_profile_bayes_factors, 
        periodic_offsets)
    in_files = [metagene_profile_bayes_factors]
    out_files = [periodic_offsets]
    file_checkers = {
        periodic_offsets: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files, 
        file_checkers=file_checkers, overwrite=args.overwrite, call=call)

    # get the lengths and offsets which meet the required criteria from the config file
    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(config, 
        args.name, args.do_not_call, is_unique=is_unique)

    if len(lengths) == 0:
        msg = ("No periodic read lengths and offsets were found. Try relaxing "
            "min_metagene_profile_count, min_metagene_bf_mean, max_metagene_bf_var, "
            "and/or min_metagene_bf_likelihood. Quitting.")
        logger.critical(msg)
        return

    lengths_str = ' '.join(str(l) for l in lengths)
    offsets_str = ' '.join(str(o) for o in offsets)

    seqname_prefix_str = utils.get_config_argument(config, 'seqname_prefix')
    
    # extract the riboseq profiles for each orf
    unique_filename = filenames.get_riboseq_bam(config['riboseq_data'], args.name, 
        is_unique=is_unique, note=note)

    profiles_filename = filenames.get_riboseq_profiles(config['riboseq_data'], args.name, 
        length=lengths, offset=offsets, is_unique=is_unique, note=note)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'], config['genome_name'], 
        note=config.get('orf_note'))

    exons_file = filenames.get_exons(config['genome_base_path'], config['genome_name'],
        note=config.get('orf_note'), is_orf=True)

    cmd = ("extract-orf-profiles {} {} {} {} --lengths {} --offsets {} {} {} --num-cpus {} ".format(
            unique_filename, orfs_genomic, exons_file, profiles_filename, lengths_str, 
            offsets_str, logging_str, seqname_prefix_str, args.num_cpus))
    in_files = [orfs_genomic, exons_file, unique_filename]
    out_files = [profiles_filename]

    #todo: implement a file checker for mtx files
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files, 
        overwrite=args.overwrite, call=call)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Train a model on an ASlib scenario and use it to create "
        "a solver schedule for an OASC test scenario")

    clu.add_config(parser)
    clu.add_oasc_scenario_options(parser)
    clu.add_num_cpus(parser)
    clu.add_cv_options(parser)
    clu.add_scheduler_options(parser)

    automl_utils.add_automl_options(parser)
    automl_utils.add_blas_options(parser)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # check if we need to spawn a new process for blas
    if automl_utils.spawn_for_blas(args):
        return

    # make sure our arguments were valid
    clu.validate_folds_options(args)
    if args.max_feature_steps < 1:
        args.max_feature_steps = np.inf

    if len(args.folds) == 0:
        args.folds = list(range(1, 11))

    required_keys = ['base_path']
    config = as_asl_utils.load_config(args.config, required_keys)

    training_scenario = automl_utils.load_scenario(args.training_scenario,
                                                   return_name=False)
    testing_scenario = OascTestScenario(args.testing_scenario)

    as_asl_scheduler = ASaslScheduler(args, config)
    as_asl_scheduler_fit = as_asl_scheduler.fit(training_scenario)

    msg = "Writing the scheduler to disk"
    logger.info(msg)

    model_type = "asl.scheduler"
    if args.use_random_forests:
        model_type = "rf.scheduler"

    scheduler_filename = filenames.get_model_filename(
        config['base_path'],
        model_type,
        scenario=testing_scenario.scenario.scenario,
        note=config.get('note'))

    as_asl_scheduler_fit.dump(scheduler_filename)

    msg = "Creating a schedule for the test scenario"
    logger.info(msg)
    test_schedule = as_asl_scheduler.create_schedules(
        testing_scenario.scenario)

    msg = "Writing the schedule to disk"
    logger.info(msg)

    t = "asl.{}".format(testing_scenario.scenario.scenario)
    if args.use_random_forests:
        t = "rf.{}".format(testing_scenario.scenario.scenario)

    schedule_filename = filenames.get_schedule_filename(
        config['base_path'], t, note=config.get('note'))

    with open(schedule_filename, "w") as fp:
        json.dump(test_schedule, fp=fp, indent=2)
def main():

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script runs all of the processing necessary to produce the "
        "signals used for later processing. In particular, it runs the standard "
        "rnaseq and riboseq preprocessing, estimates the abundance of transcripts with "
        "the rnaseq data, and selects the most-expressed isoforms and ORFs. Next, it removes "
        "multimapping and non-periodic-length riboseq reads. Finally, it extracts the riboseq "
        "signal for the most-expressed transcripts.")
    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    parser.add_argument('config', help="The (json) config file")
    parser.add_argument(
        'name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of processors to use",
                        type=int,
                        default=default_num_cpus)

    parser.add_argument('--mem',
                        help="The amount of RAM to request",
                        default=default_mem)

    parser.add_argument(
        '--flexbar-format-option',
        help="The name of the \"format\" "
        "option for flexbar. This changed from \"format\" to \"qtrim-format\" in "
        "version 2.7.",
        default=default_flexbar_format_option)

    parser.add_argument('--tmp',
                        help="The location for temp files",
                        default=default_tmp)

    parser.add_argument('--do-not-call', action='store_true')
    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files "
                        "will be overwritten.",
                        action='store_true')

    parser.add_argument(
        '-k',
        '--keep-intermediate-files',
        help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.",
        action='store_true')

    star_utils.add_star_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[create-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    config = yaml.load(open(args.config))
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar', args.star_executable, 'samtools', 'bowtie2',
        'create-base-genome-profile', 'remove-multimapping-reads',
        'extract-metagene-profiles', 'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets', 'extract-orf-profiles'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data', 'ribosomal_index', 'gtf', 'genome_base_path',
        'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)

    star_index = filenames.get_star_index(config['genome_base_path'],
                                          config['genome_name'],
                                          is_merged=False)

    models_base = config.get('models_base', default_models_base)

    # the first step is the standard riboseq preprocessing

    # handle do_not_call so that we _do_ call the preprocessing script,
    # but that it does not run anything
    do_not_call_argument = ""
    if not call:
        do_not_call_argument = "--do-not-call"

    overwrite_argument = ""
    if args.overwrite:
        overwrite_argument = "--overwrite"

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    flexbar_format_option_str = ""
    if args.flexbar_format_option is not None:
        flexbar_format_option_str = "--flexbar-format-option {}".format(
            args.flexbar_format_option)

    # check if we want to keep multimappers
    is_unique = not ('keep_riboseq_multimappers' in config)

    riboseq_raw_data = args.raw_data
    riboseq_bam_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                     args.name,
                                                     is_unique=is_unique,
                                                     note=note)

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    cmd = (
        "create-base-genome-profile {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}"
        .format(riboseq_raw_data, args.config, args.name, args.num_cpus,
                do_not_call_argument, overwrite_argument, logging_str,
                star_str, tmp_str, flexbar_format_option_str,
                keep_intermediate_str, mem_str))

    # There could be cases where we start somewhere in the middle of creating
    # the base genome profile. So even if the "raw data" is not available,
    # we still want to call the base pipeline.
    #in_files = [riboseq_raw_data]
    in_files = []
    out_files = [riboseq_bam_filename]
    # we always call this, and pass --do-not-call through
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=True)

    # create the metagene profiles
    metagene_profiles = filenames.get_metagene_profiles(config['riboseq_data'],
                                                        args.name,
                                                        is_unique=is_unique,
                                                        note=note)

    seqids_to_keep_str = utils.get_config_argument(config, 'seqids_to_keep')
    start_upstream_str = utils.get_config_argument(
        config, 'metagene_profile_start_upstream', 'start-upstream')
    start_downstream_str = utils.get_config_argument(
        config, 'metagene_profile_start_downstream', 'start-downstream')
    end_upstream_str = utils.get_config_argument(
        config, 'metagene_profile_end_upstream', 'end-upstream')
    end_downstream_str = utils.get_config_argument(
        config, 'metagene_profile_end_downstream', 'end-downstream')

    # use the canonical transcripts for extracting the metagene profiles
    transcript_bed = filenames.get_bed(config['genome_base_path'],
                                       config['genome_name'],
                                       is_merged=False,
                                       is_annotated=True)

    cmd = ("extract-metagene-profiles {} {} {} --num-cpus {} {} {} {} {} {} {}"
           .format(riboseq_bam_filename, transcript_bed, metagene_profiles,
                   args.num_cpus, logging_str, seqids_to_keep_str,
                   start_upstream_str, start_downstream_str, end_upstream_str,
                   end_downstream_str))

    in_files = [riboseq_bam_filename, orfs_genomic]
    out_files = [metagene_profiles]
    file_checkers = {metagene_profiles: utils.check_gzip_file}
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite,
                                   call=call)

    # estimate the periodicity for each offset for all read lengths
    metagene_profile_bayes_factors = filenames.get_metagene_profiles_bayes_factors(
        config['riboseq_data'], args.name, is_unique=is_unique, note=note)

    #periodic_models_str = utils.get_config_argument(config, 'periodic_models')
    #non_periodic_models_str = utils.get_config_argument(config, 'nonperiodic_models')
    periodic_models = filenames.get_models(models_base, 'periodic')
    non_periodic_models = filenames.get_models(models_base, 'nonperiodic')

    periodic_models_str = ' '.join(periodic_models)
    non_periodic_models_str = ' '.join(non_periodic_models)

    periodic_models_str = "--periodic-models {}".format(periodic_models_str)
    non_periodic_models_str = "--nonperiodic-models {}".format(
        non_periodic_models_str)

    periodic_offset_start_str = utils.get_config_argument(
        config, 'periodic_offset_start')
    periodic_offset_end_str = utils.get_config_argument(
        config, 'periodic_offset_end')
    metagene_profile_length_str = utils.get_config_argument(
        config, 'metagene_profile_length')
    seed_str = utils.get_config_argument(config, 'seed')
    chains_str = utils.get_config_argument(config, 'chains')
    iterations_str = utils.get_config_argument(config,
                                               'metagene_profile_iterations',
                                               'iterations')

    cmd = ("estimate-metagene-profile-bayes-factors {} {} --num-cpus {} {} {} "
           "{} {} {} {} {} {} {}".format(
               metagene_profiles, metagene_profile_bayes_factors,
               args.num_cpus, periodic_models_str, non_periodic_models_str,
               periodic_offset_start_str, periodic_offset_end_str,
               metagene_profile_length_str, seed_str, chains_str,
               iterations_str, logging_str))

    in_files = [metagene_profiles]
    in_files.extend(periodic_models)
    in_files.extend(non_periodic_models)
    out_files = [metagene_profile_bayes_factors]
    file_checkers = {metagene_profile_bayes_factors: utils.check_gzip_file}
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite,
                                   call=call)

    # select the best read lengths for constructing the signal
    periodic_offsets = filenames.get_periodic_offsets(config['riboseq_data'],
                                                      args.name,
                                                      is_unique=is_unique,
                                                      note=note)

    cmd = "select-periodic-offsets {} {}".format(
        metagene_profile_bayes_factors, periodic_offsets)
    in_files = [metagene_profile_bayes_factors]
    out_files = [periodic_offsets]
    file_checkers = {periodic_offsets: utils.check_gzip_file}
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite,
                                   call=call)

    # get the lengths and offsets which meet the required criteria from the config file
    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
        config, args.name, args.do_not_call, is_unique=is_unique)

    if len(lengths) == 0:
        msg = (
            "No periodic read lengths and offsets were found. Try relaxing "
            "min_metagene_profile_count, min_metagene_bf_mean, max_metagene_bf_var, "
            "and/or min_metagene_bf_likelihood. Quitting.")
        logger.critical(msg)
        return

    lengths_str = ' '.join(str(l) for l in lengths)
    offsets_str = ' '.join(str(o) for o in offsets)

    seqname_prefix_str = utils.get_config_argument(config, 'seqname_prefix')

    # extract the riboseq profiles for each orf
    unique_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                args.name,
                                                is_unique=is_unique,
                                                note=note)

    profiles_filename = filenames.get_riboseq_profiles(config['riboseq_data'],
                                                       args.name,
                                                       length=lengths,
                                                       offset=offsets,
                                                       is_unique=is_unique,
                                                       note=note)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    cmd = (
        "extract-orf-profiles {} {} {} {} --lengths {} --offsets {} {} {} --num-cpus {} "
        .format(unique_filename, orfs_genomic, exons_file, profiles_filename,
                lengths_str, offsets_str, logging_str, seqname_prefix_str,
                args.num_cpus))
    in_files = [orfs_genomic, exons_file, unique_filename]
    out_files = [profiles_filename]

    #todo: implement a file checker for mtx files
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script extracts the differential micropeptides from two "
        "conditions. Please see the documentation in redmine for more details.\n\n"
        "Please see the pyensembl (https://github.com/hammerlab/pyensembl) "
        "documentation for more information about the ensembl release and species."
    )

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name_a', help="The name of the first condition")
    parser.add_argument('name_b', help="The name of the second condition")
    parser.add_argument('out', help="The output (.csv.gz or .xlsx) file")

    parser.add_argument(
        '-a',
        '--append-sheet',
        help="If this flag is given, "
        "then a worksheet with the name '<name_a>,<name_b>' will be appended "
        "to the .xlsx file given by out (if it exists)",
        action='store_true')

    parser.add_argument(
        '-f',
        '--filter',
        help="If this flag is present, then "
        "the output will be filtered to include only the differential "
        "micropeptides with the highest KL-divergence and read coverage",
        action='store_true')

    parser.add_argument(
        '--read-filter-percent',
        help="If the --filter flag "
        "is given, then only the top --read-filter-percent micropeptides will "
        "be considered for the final output. They still must meet the KL-"
        "divergence filtering criteria.",
        type=float,
        default=default_read_filter_percent)

    parser.add_argument(
        '--kl-filter-percent',
        help="If the --filter flag "
        "is given, then only the top --kl-filter-percent micropeptides will "
        "be considered for the final output. They still must meet the read "
        "coverage filtering criteria.",
        type=float,
        default=default_kl_filter_percent)

    parser.add_argument(
        '--id-matches',
        help="This is a list of files which "
        "contain ORF identifiers to compare to the differential micropeptides. "
        "For each of the files given, two columns will be added to the output "
        "which indicate if either A or B appear in the respective file. Each "
        "file should have a single ORF identifier on each line and contain "
        "nothing else.",
        nargs='*',
        default=default_id_matches)

    parser.add_argument(
        '--id-match-names',
        help="A name to include in the "
        "output file for each --id-matches file. The number of names must "
        "match the number of files.",
        nargs='*',
        default=default_id_match_names)

    parser.add_argument(
        '--overlaps',
        help="This is a list of bed12+ files "
        "which will be compared to the differential micropeptides. Two columns "
        "(one for A, one for B) will be added to the output which indicate if "
        "the respective micropeptides overlap a feature in each file by at "
        "least 1 bp.",
        nargs='*',
        default=default_overlaps)

    parser.add_argument(
        '--overlap-names',
        help="A name to include in the "
        "output file for each --overlaps file. The number of names must match "
        "the number of files.",
        nargs='*',
        default=default_overlap_names)

    parser.add_argument(
        '-r',
        '--ensembl-release',
        help="The version of Ensembl "
        "to use when mapping transcript identifiers to gene identifiers",
        type=int,
        default=default_ensembl_release)

    parser.add_argument(
        '-s',
        '--ensembl-species',
        help="The Ensembl species "
        "to use when mapping transcript identifiers to gene identifiers",
        default=default_ensembl_species)

    parser.add_argument(
        '--a-is-single-sample',
        help="By default, this script "
        "assumes the predictions come from merged replicates. If name_a is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.",
        action='store_true')

    parser.add_argument(
        '--b-is-single-sample',
        help="By default, this script "
        "assumes the predictions come from merged replicates. If name_b is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.",
        action='store_true')

    parser.add_argument('--fields-to-keep',
                        help="The fields to keep from the "
                        "Bayes factor file for each condition",
                        nargs='*',
                        default=default_fields_to_keep)

    parser.add_argument('--max-micropeptide-len',
                        help="The maximum (inclusive) "
                        "length of ORFs considered as micropeptides",
                        type=int,
                        default=default_max_micropeptide_len)

    parser.add_argument(
        '--do-not-fix-tcons',
        help="By default, the \"TCONS_\" "
        "identifiers from StringTie, etc., do not parse correctly; this script "
        "update the identifiers so that will parse correctly unless instructed not "
        "to. The script is likely to crash if the identifiers are not fixed.",
        action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ensembl database"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release,
                                       species=args.ensembl_species)
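    # accessing the db property presumably forces pyensembl to download and
    # index the annotation database before it is needed below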
    ensembl.db

    msg = "Checking the id-match and overlaps files"
    logger.info(msg)

    if len(args.id_matches) != len(args.id_match_names):
        msg = ("The number of --id-matches files and --id-match-names do not "
               "match. {} files and {} names".format(len(args.id_matches),
                                                     len(args.id_match_names)))

        raise ValueError(msg)

    if len(args.overlaps) != len(args.overlap_names):
        msg = ("The number of --overlaps files and --overlaps-names do not "
               "match. {} files and {} names".format(len(args.overlaps),
                                                     len(args.overlap_names)))

        raise ValueError(msg)

    utils.check_files_exist(args.id_matches)
    utils.check_files_exist(args.overlaps)

    if args.filter:
        msg = "Validating filter percentages"
        logger.info(msg)

        math_utils.check_range(args.read_filter_percent,
                               0,
                               1,
                               variable_name="--read-filter-percent")

        math_utils.check_range(args.kl_filter_percent,
                               0,
                               1,
                               variable_name="--kl-filter-percent")

    msg = "Extracting file names"
    logger.info(msg)

    config = yaml.load(open(args.config))

    note_str = config.get('note', None)

    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    # and the smoothing parameters
    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations',
                                        None)

    lengths_a = None
    offsets_a = None

    if args.a_is_single_sample:
        lengths_a, offsets_a = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name_a, is_unique=is_unique)

    bayes_factors_a = filenames.get_riboseq_bayes_factors(
        config['riboseq_data'],
        args.name_a,
        length=lengths_a,
        offset=offsets_a,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors_a):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".
               format(args.name_a, bayes_factors_a))
        raise FileNotFoundError(msg)

    predicted_orfs_a = filenames.get_riboseq_predicted_orfs(
        config['riboseq_data'],
        args.name_a,
        length=lengths_a,
        offset=offsets_a,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations,
        is_filtered=True,
        is_chisq=False)

    if not os.path.exists(predicted_orfs_a):
        msg = (
            "Could not find the predictions bed file for {}. ({}). Quitting.".
            format(args.name_a, predicted_orfs_a))
        raise FileNotFoundError(msg)

    lengths_b = None
    offsets_b = None
    if args.b_is_single_sample:
        lengths_b, offsets_b = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name_b, is_unique=is_unique)

    bayes_factors_b = filenames.get_riboseq_bayes_factors(
        config['riboseq_data'],
        args.name_b,
        length=lengths_b,
        offset=offsets_b,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors_b):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".
               format(args.name_b, bayes_factors_b))
        raise FileNotFoundError(msg)

    predicted_orfs_b = filenames.get_riboseq_predicted_orfs(
        config['riboseq_data'],
        args.name_b,
        length=lengths_b,
        offset=offsets_b,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations,
        is_filtered=True,
        is_chisq=False)

    if not os.path.exists(predicted_orfs_b):
        msg = (
            "Could not find the predictions bed file for {}. ({}). Quitting.".
            format(args.name_b, predicted_orfs_b))
        raise FileNotFoundError(msg)

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    if not os.path.exists(exons_file):
        msg = "Could not find the exons file ({}). Quitting.".format(
            exons_file)
        raise FileNotFoundError(msg)

    msg = "Reading the exons"
    logger.info(msg)

    exons = bed_utils.read_bed(exons_file)

    msg = "Reading the BF files"
    logger.info(msg)

    bf_df_a = bed_utils.read_bed(bayes_factors_a)
    bf_df_b = bed_utils.read_bed(bayes_factors_b)

    msg = "Reading the predictions files"
    logger.info(msg)

    bed_df_a = bed_utils.read_bed(predicted_orfs_a)
    bed_df_b = bed_utils.read_bed(predicted_orfs_b)

    differential_micropeptide_dfs = []

    # extract micropeptides
    msg = "Extracting micropeptides"
    logger.info(msg)

    m_micropeptides_a = bed_df_a['orf_len'] <= args.max_micropeptide_len
    m_micropeptides_b = bed_df_b['orf_len'] <= args.max_micropeptide_len

    micropeptides_a = bed_df_a[m_micropeptides_a]
    micropeptides_b = bed_df_b[m_micropeptides_b]

    long_orfs_a = bed_df_a[~m_micropeptides_a]
    long_orfs_b = bed_df_b[~m_micropeptides_b]

    msg = "Finding micropeptides in A with no overlap in B"
    logger.info(msg)

    micropeptides_a_no_match_b = bed_utils.subtract_bed(micropeptides_a,
                                                        bed_df_b,
                                                        exons=exons)

    micropeptides_a_no_match_b_df = pd.DataFrame()
    micropeptides_a_no_match_b_df['A'] = list(micropeptides_a_no_match_b)
    micropeptides_a_no_match_b_df['B'] = None
    micropeptides_a_no_match_b_df['kl'] = np.inf
    micropeptides_a_no_match_b_df['overlap_type'] = 'micro_a_only'

    differential_micropeptide_dfs.append(micropeptides_a_no_match_b_df)

    msg = "Finding micropeptides in B with no overlap in A"
    logger.info(msg)

    micropeptides_b_no_match_a = bed_utils.subtract_bed(micropeptides_b,
                                                        bed_df_a,
                                                        exons=exons)

    micropeptides_b_no_match_a_df = pd.DataFrame()
    micropeptides_b_no_match_a_df['B'] = list(micropeptides_b_no_match_a)
    micropeptides_b_no_match_a_df['A'] = None
    micropeptides_b_no_match_a_df['kl'] = np.inf
    micropeptides_b_no_match_a_df['overlap_type'] = 'micro_b_only'

    differential_micropeptide_dfs.append(micropeptides_b_no_match_a_df)

    msg = "Finding overlapping micropeptides"
    logger.info(msg)

    micropeptides_a_micropeptides_b_df = get_overlap_df(
        micropeptides_a, micropeptides_b, 'micro_a_micro_b', bf_df_a, bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_micropeptides_b_df)

    micropeptides_a_long_b_df = get_overlap_df(micropeptides_a, long_orfs_b,
                                               'micro_a_long_b', bf_df_a,
                                               bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_long_b_df)

    micropeptides_b_long_a_df = get_overlap_df(long_orfs_a, micropeptides_b,
                                               'long_a_micro_b', bf_df_a,
                                               bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_b_long_a_df)

    differential_micropeptides_df = pd.concat(differential_micropeptide_dfs)

    msg = "Adding read count information"
    logger.info(msg)

    res = differential_micropeptides_df.merge(bf_df_a[args.fields_to_keep],
                                              left_on='A',
                                              right_on='id',
                                              how='left')
    to_rename = {f: "{}_A".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_A', axis=1)

    res = res.merge(bf_df_b[args.fields_to_keep],
                    left_on='B',
                    right_on='id',
                    how='left')
    to_rename = {f: "{}_B".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_B', axis=1)

    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    if not args.do_not_fix_tcons:
        # replace TCONS_ with TCONS
        res['A'] = res['A'].str.replace("TCONS_", "TCONS")
        res['B'] = res['B'].str.replace("TCONS_", "TCONS")

    msg = "Extracting the genes and their biotypes using pyensembl"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release,
                                       species=args.ensembl_species)
    ensembl_transcript_ids = set(ensembl.transcript_ids())

    biotypes_a = parallel.apply_df_simple(res, get_transcript_and_biotype, 'A',
                                          ensembl, ensembl_transcript_ids)
    biotypes_b = parallel.apply_df_simple(res, get_transcript_and_biotype, 'B',
                                          ensembl, ensembl_transcript_ids)

    biotypes_a = utils.remove_nones(biotypes_a)
    biotypes_b = utils.remove_nones(biotypes_b)

    biotypes_a = pd.DataFrame(biotypes_a)
    biotypes_b = pd.DataFrame(biotypes_b)

    res = res.merge(biotypes_a, on='A', how='left')
    res = res.merge(biotypes_b, on='B', how='left')

    msg = "Pulling annotations from mygene.info"
    logger.info(msg)

    # pull annotations from mygene
    gene_info_a = mygene_utils.query_mygene(res['gene_id_A'])
    gene_info_b = mygene_utils.query_mygene(res['gene_id_B'])

    # and add the mygene info
    res = res.merge(gene_info_a,
                    left_on='gene_id_A',
                    right_on='gene_id',
                    how='left')

    to_rename = {f: "{}_A".format(f) for f in gene_info_a.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)

    res = res.merge(gene_info_b,
                    left_on='gene_id_B',
                    right_on='gene_id',
                    how='left')

    to_rename = {f: "{}_B".format(f) for f in gene_info_a.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)

    msg = "Removing duplicates"
    logger.info(msg)
    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    msg = "Adding --id-matches columns"
    logger.info(msg)

    for (id_match_file, name) in zip(args.id_matches, args.id_match_names):
        res = add_id_matches(res, id_match_file, name)

    msg = "Adding --overlaps columns"
    logger.info(msg)

    for (overlap_file, name) in zip(args.overlaps, args.overlap_names):
        res = add_overlaps(res, overlap_file, name, bed_df_a, bed_df_b, exons)

    msg = "Sorting by in-frame reads"
    logger.info(msg)

    res['x_1_sum_A'] = res['x_1_sum_A'].fillna(0)
    res['x_1_sum_B'] = res['x_1_sum_B'].fillna(0)
    res['x_1_sum'] = res['x_1_sum_A'] + res['x_1_sum_B']
    res = res.sort_values('x_1_sum', ascending=False)

    if args.filter:
        msg = "Filtering the micropeptides by read coverage and KL-divergence"
        logger.info(msg)

        x_1_sum_ranks = res['x_1_sum'].rank(method='min',
                                            na_option='top',
                                            ascending=False)
        num_x_1_sum_ranks = x_1_sum_ranks.max()
        max_good_x_1_sum_rank = num_x_1_sum_ranks * args.read_filter_percent
        m_good_x_1_sum_rank = x_1_sum_ranks <= max_good_x_1_sum_rank

        msg = ("Number of micropeptides passing read filter: {}".format(
            sum(m_good_x_1_sum_rank)))
        logger.debug(msg)

        kl_ranks = res['kl'].rank(method='dense',
                                  na_option='top',
                                  ascending=False)
        num_kl_ranks = kl_ranks.max()
        max_good_kl_rank = num_kl_ranks * args.kl_filter_percent
        m_good_kl_rank = kl_ranks <= max_good_kl_rank

        msg = ("Number of micropeptides passing KL filter: {}".format(
            sum(m_good_kl_rank)))
        logger.debug(msg)

        m_both_filters = m_good_x_1_sum_rank & m_good_kl_rank

        msg = ("Number of micropeptides passing both filters: {}".format(
            sum(m_both_filters)))
        logger.debug(msg)

        res = res[m_both_filters]

    msg = "Writing differential micropeptides to disk"
    logger.info(msg)

    if not args.append_sheet:
        utils.write_df(res, args.out, index=False)
    else:
        sheet_name = "{},{}".format(args.name_a, args.name_b)
        utils.append_to_xlsx(res, args.out, sheet=sheet_name, index=False)
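# A minimal, self-contained sketch of the rank-then-threshold filtering used in
# the --filter branch above, with toy data. The rank parameters mirror the
# script; the column values and the 50% cutoffs are illustrative only.
def _filter_sketch():
    import pandas as pd

    toy = pd.DataFrame({
        'x_1_sum': [100.0, 5.0, 250.0, 0.0, 42.0],
        'kl': [2.1, 0.3, 5.7, float('inf'), 1.0],
    })

    read_filter_percent = 0.5
    kl_filter_percent = 0.5

    # rank 1 is the largest value; ties share the smallest rank
    x_1_sum_ranks = toy['x_1_sum'].rank(method='min', na_option='top', ascending=False)
    m_good_x_1_sum_rank = x_1_sum_ranks <= x_1_sum_ranks.max() * read_filter_percent

    kl_ranks = toy['kl'].rank(method='dense', na_option='top', ascending=False)
    m_good_kl_rank = kl_ranks <= kl_ranks.max() * kl_filter_percent

    # only rows passing both filters are kept
    return toy[m_good_x_1_sum_rank & m_good_kl_rank]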
Example #38
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script uses the mygene.info service to find annotations "
        "for the transcripts associated with the ORFs in the given bed file. In "
        "particular, it extracts information from Swiss-Prot, TrEMBL, Interpro, "
        "PDB, Pfam, PROSITE, the Gene Ontology, and KEGG.")

    parser.add_argument('bed', help="The bed file")
    parser.add_argument('out', help="The output file. Its type will be inferred "
        "from its extension.")

    parser.add_argument('--do-not-trim', help="By default, the script will "
        "attempt to trim transcript identifiers such that they are valid Ensembl "
        "identifiers. If this flag is given, no trimming will take place.",
        action='store_true')

    parser.add_argument('--scopes', help="A list of scopes to use when querying "
        "mygene.info. Please see the documentation for more information about "
        "valid scopes: http://mygene.info/doc/query_service.html#available_fields",
        nargs='*', default=default_scopes)

    parser.add_argument('--do-not-convert-ids', help="By default, the script will "
        "treat the identifiers in the file as transcript identifiers. It first "
        "maps those to gene identifiers, and then it uses those to find the "
        "gene annotations. If the identifiers are already gene ids (or whatever "
        "is specified by scopes), then the first mapping is not necessary and "
        "can be skipped using this flag.", action='store_true')
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    convert_ids = not args.do_not_convert_ids

    msg = "Reading the bed file"
    logger.info(msg)
    bed = bed_utils.read_bed(args.bed)
    bed = bed[fields_to_keep]

    msg = "Extracting transcript ids"
    logger.info(msg)
    trim = not args.do_not_trim
    orf_ids = parallel.apply_iter_simple(bed['id'], parse_orf_id, trim)
    orf_ids_df = pd.DataFrame(orf_ids)

    if convert_ids:
        msg = "Querying transcript to gene id mapping"
        logger.info(msg)
        gene_ids = mygene_utils.get_transcript_to_gene_mapping(orf_ids_df['transcript_id'])
    else:
        gene_ids = pd.DataFrame()
        gene_ids['transcript_id'] = orf_ids_df['transcript_id']
        gene_ids['gene_id'] = orf_ids_df['transcript_id']

    msg = "Querying gene annotations"
    logger.info(msg)
    res_df = mygene_utils.query_mygene(gene_ids['gene_id'])

    msg = "Combining gene annotations with transcript ids"
    logger.info(msg)
    res_df = gene_ids.merge(res_df, on='gene_id', how='inner')

    msg = "Combining transcript annotations with ORF ids"
    logger.info(msg)
    orf_ids_fields = ['transcript_id', 'orf_id']
    res_df = orf_ids_df[orf_ids_fields].merge(res_df, on='transcript_id', how='inner')

    msg = "Combining ORF annotations with ORF predictions"
    logger.info(msg)
    res_df = bed.merge(res_df, left_on='id', right_on='orf_id', how='left')

    msg = "Writing ORF annotations to disk"
    logger.info(msg)
    utils.write_df(res_df, args.out, index=False)
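# A toy illustration of the join chain above: transcript-to-gene mapping, then
# gene-level annotations, then a left join back onto the bed records. All of
# the values are made up; only the join keys mirror the script.
def _merge_sketch():
    import pandas as pd

    orf_ids_df = pd.DataFrame({'orf_id': ['orf_1', 'orf_2'],
                               'transcript_id': ['tx_a', 'tx_b']})
    gene_ids = pd.DataFrame({'transcript_id': ['tx_a', 'tx_b'],
                             'gene_id': ['gene_x', 'gene_y']})
    annotations = pd.DataFrame({'gene_id': ['gene_x'], 'symbol': ['ABC1']})
    bed = pd.DataFrame({'id': ['orf_1', 'orf_2']})

    res_df = gene_ids.merge(annotations, on='gene_id', how='inner')
    res_df = orf_ids_df[['transcript_id', 'orf_id']].merge(res_df,
                                                           on='transcript_id',
                                                           how='inner')

    # orf_2 has no annotation, so its annotation columns are left as NaN
    return bed.merge(res_df, left_on='id', right_on='orf_id', how='left')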
Example #39
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script labels the ORFs found with extract-orf-coordinates "
        "based on their exon structure and relation to the annotated, canonical "
        "ORFs. It requires the exon blocks for the ORFs (created with "
        "split-bed12-blocks). It completely reads in the ORFs, so unless otherwise "
        "desired for some reason, the input and output files can be the same.")

    parser.add_argument('annotated_transcripts', help="The annotated transcripts "
        "for the genome, in bed12+ format")
    parser.add_argument('extracted_orfs', help="The ORFs extracted from the "
        "transcripts, in bed12+ format")
    parser.add_argument('orf_exons', help="The exon blocks for the ORFs, in "
        "bed6+ format")

    parser.add_argument('out', help="The output (bed12+.gz) file")

    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use for "
        "a few parts of the script", type=int, default=default_num_cpus)

    parser.add_argument('-f', '--filter', help="If this flag is given, then ORFs "
        "which are completely covered by an annotated transcript are discarded. "
        "Presumably, this is used to filter uninteresting ORFs from de novo "
        "assemblies.", action='store_true')

    parser.add_argument('-e', '--annotated-exons', help="If the --filter flag is "
        "given, the annotated transcript exons can optionally be provided with "
        "this option. If they are not given, they will be split from the annotated "
        "transcripts. That is generally not a very expensive operation relative to "
        "everything else in the labeling script. If --filter is not given, then "
        "these are ignored.", default=default_annotated_exons)

    parser.add_argument('-n', '--nonoverlapping-label', help="If this option is "
        "given, then ORFs which do not overlap the annotated transcripts at all "
        "will be given this label. Otherwise, they will be labeled as \"suspect\"",
        default=default_nonoverlapping_label)

    parser.add_argument('-l', '--label-prefix', help="This string is prepended "
        "to all labels assigned to ORFs. For example, it is a useful way to "
        "indicate ORFs from de novo assemblies are \"novel.\" In any case, this "
        "*is not* prepended to \"canonical\" ORFs.", default=default_label_prefix)
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading annotated transcripts"
    logger.info(msg)
    annotated_transcripts = bed_utils.read_bed(args.annotated_transcripts)

    msg = "Reading extracted ORFs and exons"
    logger.info(msg)
    extracted_orfs = bed_utils.read_bed(args.extracted_orfs)
    extracted_orf_exons = bed_utils.read_bed(args.orf_exons)

    msg = "Found {} extracted ORFs with {} exons".format(len(extracted_orfs),
        len(extracted_orf_exons))
    logger.debug(msg)

    # check if we want to remove the extracted_orfs completely covered by
    # the annotated transcripts
    if args.filter:
        msg = ("Removing extracted ORFs which are completely covered by the "
            "annotated transcripts")
        logger.info(msg)
        
        # we need the annotated transcript exons
        if args.annotated_exons is None:
            msg = "Splitting the annotated transcripts into exon blocks"
            logger.info(msg)

            annotated_exons = bed_utils.split_bed12(annotated_transcripts, 
                                    num_cpus=args.num_cpus, progress_bar=True)
        else:
            msg = "Reading the annotated transcript exons"
            logger.info(msg)

            annotated_exons = bed_utils.read_bed(args.annotated_exons)

        msg = "Finding completely covered extracted ORFs"
        logger.info(msg)

        # subtract_bed returns the ids of the ORFs which are *not* completely
        # covered by the annotated exons; those are the ORFs we keep
        uncovered_ids = bed_utils.subtract_bed(extracted_orf_exons, annotated_exons, 
                                               min_a_overlap=1)

        m_unfiltered = extracted_orfs['id'].isin(uncovered_ids)
        extracted_orfs = extracted_orfs[m_unfiltered]

        # also discard the unnecessary exons
        m_unfiltered = extracted_orf_exons['id'].isin(uncovered_ids)
        extracted_orf_exons = extracted_orf_exons[m_unfiltered]

        msg = "After filtering, {} extracted ORFs remain".format(len(extracted_orfs))
        logger.info(msg)

    
    # if the nonoverlapping-label is given, annotate and remove the ORFs
    # which do not at all overlap the annotations
    if args.nonoverlapping_label is not None:

        # the annotated exons are needed here too; create them if --filter was not given
        if not args.filter:
            if args.annotated_exons is None:
                annotated_exons = bed_utils.split_bed12(annotated_transcripts, 
                                        num_cpus=args.num_cpus, progress_bar=True)
            else:
                annotated_exons = bed_utils.read_bed(args.annotated_exons)

        nonoverlapping_ids = bed_utils.subtract_bed(
                    extracted_orfs, 
                    annotated_transcripts, 
                    exons_a=extracted_orf_exons,
                    exons_b=annotated_exons)

        m_nonoverlapping = extracted_orf_exons['id'].isin(nonoverlapping_ids)
        extracted_orf_exons = extracted_orf_exons[~m_nonoverlapping]

        m_nonoverlapping = extracted_orfs['id'].isin(nonoverlapping_ids)
        extracted_orfs.loc[m_nonoverlapping, 'orf_type'] = args.nonoverlapping_label

        msg = ("Found {} ORFs completely nonoverlapping annotated transcripts".
            format(len(nonoverlapping_ids)))
        logger.info(msg)

    msg = "Removing the annotated UTRs from the transcripts"
    logger.info(msg)
    canonical_orfs = bed_utils.retain_all_thick_only(annotated_transcripts, 
        num_cpus=args.num_cpus)

    msg = "Splitting the canonical ORFs into exons"
    logger.info(msg)
    canonical_orf_exons = bed_utils.split_bed12(canonical_orfs, 
        num_cpus=args.num_cpus, progress_bar=True)

    msg = "Extracting annotated 5' leader regions"
    logger.info(msg)
    five_prime_regions = bed_utils.retain_all_five_prime_of_thick(
        annotated_transcripts, num_cpus=args.num_cpus)

    if len(five_prime_regions) == 0:
        msg = "No annotated 5' leader regions were found"
        logger.warning(msg)

    msg = "Splitting the 5' leaders into exons"
    logger.info(msg)
    five_prime_exons = bed_utils.split_bed12(five_prime_regions, 
        num_cpus=args.num_cpus, progress_bar=True)

    msg = "Extracting annotated 3' trailer regions"
    logger.info(msg)
    three_prime_regions = bed_utils.retain_all_three_prime_of_thick(
        annotated_transcripts, num_cpus=args.num_cpus)

    
    if len(three_prime_regions) == 0:
        msg = "No annotated 3' trailer regions were found"
        logger.warning(msg)


    msg = "Splitting the 3' trailers into exons"
    logger.info(msg)
    three_prime_exons = bed_utils.split_bed12(three_prime_regions, 
        num_cpus=args.num_cpus, progress_bar=True)

    msg = "Splitting noncoding transcripts into exons"
    logger.info(msg)

    m_no_thick_start = annotated_transcripts['thick_start'] == -1
    m_no_thick_end = annotated_transcripts['thick_end'] == -1
    m_no_thick = m_no_thick_start & m_no_thick_end
    noncoding_transcripts = annotated_transcripts[m_no_thick]

    noncoding_exons = bed_utils.split_bed12(noncoding_transcripts, 
                                    num_cpus=args.num_cpus, progress_bar=True)

    msg = "Marking canonical and extracted ORFs with the same stop codon"
    logger.info(msg)

    # first, add the true ORF end
    m_forward_canonical = canonical_orfs['strand'] == '+'
    m_reverse_canonical = canonical_orfs['strand'] == '-'

    m_forward_extracted = extracted_orfs['strand'] == '+'
    m_reverse_extracted = extracted_orfs['strand'] == '-'

    canonical_orfs['orf_end'] = canonical_orfs['end']
    canonical_orfs.loc[m_reverse_canonical, 'orf_end'] = canonical_orfs.loc[m_reverse_canonical, 'start']

    extracted_orfs['orf_end'] = extracted_orfs['end']
    extracted_orfs.loc[m_reverse_extracted, 'orf_end'] = extracted_orfs.loc[m_reverse_extracted, 'start']

    # now, find extracted ORFs with the same "orf_end" (and seqname, strand) as canonical ORFs
    merge_fields = ['seqname', 'strand', 'orf_end']
    canonical_extracted_orf_ends = canonical_orfs.merge(extracted_orfs, 
                        on=merge_fields, suffixes=['_canonical', '_extracted'])

    # now, pull this into a set
    zip_it =  zip(  canonical_extracted_orf_ends['id_canonical'], 
                    canonical_extracted_orf_ends['id_extracted'])
    canonical_extracted_matching_ends = { (c,a) for c,a in zip_it }

    msg = "Finding ORFs which exactly overlap the canonical ORFs"
    logger.info(msg)

    exact_matches = bed_utils.get_bed_overlaps(canonical_orf_exons, 
                        extracted_orf_exons, min_a_overlap=1, min_b_overlap=1)

    exact_match_orf_ids = {o.b_info for o in exact_matches}
    m_exact_orf_matches = extracted_orf_exons['id'].isin(exact_match_orf_ids)
    extracted_orf_exons = extracted_orf_exons[~m_exact_orf_matches]

    m_canonical = extracted_orfs['id'].isin(exact_match_orf_ids)
    extracted_orfs.loc[m_canonical, 'orf_type'] = 'canonical'

    msg = "Found {} canonical ORFs".format(len(exact_match_orf_ids))
    logger.info(msg)

    msg = "Finding ORFs which are extended versions of the canonical ORFs"
    logger.info(msg)

    extended_matches = bed_utils.get_bed_overlaps(canonical_orf_exons, 
                                        extracted_orf_exons, min_a_overlap=1)

    # make sure the "end"s match before calling something an extended match
    extended_match_ids = {m.b_info for m in tqdm.tqdm(extended_matches) 
                            if (m.a_info, m.b_info) in canonical_extracted_matching_ends}

    m_extended_matches = extracted_orf_exons['id'].isin(extended_match_ids)
    extracted_orf_exons = extracted_orf_exons[~m_extended_matches]

    m_canonical_extended = extracted_orfs['id'].isin(extended_match_ids)

    l = "{}canonical_extended".format(args.label_prefix)
    extracted_orfs.loc[m_canonical_extended, 'orf_type'] = l

    msg = "Found {} canonical_extended ORFs".format(len(extended_match_ids))
    logger.info(msg)

    msg = "Finding ORFs which are truncated versions of the canonical ORFs"
    logger.info(msg)

    truncated_matches = bed_utils.get_bed_overlaps(canonical_orf_exons, 
                                        extracted_orf_exons, min_b_overlap=1)

    # make sure the "end"s match before calling something a truncated match
    truncated_match_ids = {m.b_info for m in tqdm.tqdm(truncated_matches) 
                            if (m.a_info, m.b_info) in canonical_extracted_matching_ends}

    m_truncated_matches = extracted_orf_exons['id'].isin(truncated_match_ids)
    extracted_orf_exons = extracted_orf_exons[~m_truncated_matches]

    m_canonical_truncated = extracted_orfs['id'].isin(truncated_match_ids)
    
    l = "{}canonical_truncated".format(args.label_prefix)
    extracted_orfs.loc[m_canonical_truncated, 'orf_type'] = l

    msg = "Found {} canonical_truncated ORFs".format(len(truncated_match_ids))
    logger.info(msg)

    msg = ("Labeling ORFs which are completely covered by a canonical ORF but "
                    "do not share its stop codon")
    logger.info(msg)

    # anything in "truncated matches" which *does not* share a stop codon with 
    # the match is a "within" orf
    within_ids = {m.b_info for m in truncated_matches if m.b_info not in truncated_match_ids}

    m_within_matches = extracted_orf_exons['id'].isin(within_ids)
    extracted_orf_exons = extracted_orf_exons[~m_within_matches]

    m_within = extracted_orfs['id'].isin(within_ids)
    
    l = "{}within".format(args.label_prefix)
    extracted_orfs.loc[m_within, 'orf_type'] = l

    msg = "Found {} within ORFs".format(len(within_ids))
    logger.info(msg)
        
    msg = "Finding out-of-frame overlaps"
    logger.info(msg)
    out_of_frame_matches = bed_utils.get_bed_overlaps(canonical_orf_exons, 
                                                        extracted_orf_exons)

    msg = "Finding leader overlaps"
    logger.info(msg)

    leader_matches = bed_utils.get_bed_overlaps(five_prime_exons, 
                                                    extracted_orf_exons)

    msg = "Finding trailer overlaps"
    logger.info(msg)

    trailer_matches = bed_utils.get_bed_overlaps(three_prime_exons, 
                                                    extracted_orf_exons)

    msg = ("Labeling ORFs which have (out-of-frame) overlaps with both a "
        "canonical ORF and annotated leaders or trailers")
    logger.info(msg)

    out_of_frame_ids = {m.b_info for m in out_of_frame_matches}
    leader_ids = {m.b_info for m in leader_matches}
    trailer_ids = {m.b_info for m in trailer_matches}

    leader_overlap_ids = out_of_frame_ids & leader_ids
    trailer_overlap_ids = out_of_frame_ids & trailer_ids

    m_leader_overlap_matches = extracted_orf_exons['id'].isin(leader_overlap_ids)
    extracted_orf_exons = extracted_orf_exons[~m_leader_overlap_matches]

    m_trailer_overlap_matches = extracted_orf_exons['id'].isin(trailer_overlap_ids)
    extracted_orf_exons = extracted_orf_exons[~m_trailer_overlap_matches]

    m_five_prime_overlap = extracted_orfs['id'].isin(leader_overlap_ids)
    
    l = "{}five_prime_overlap".format(args.label_prefix)
    extracted_orfs.loc[m_five_prime_overlap, 'orf_type'] = l

    m_three_prime_overlap = extracted_orfs['id'].isin(trailer_overlap_ids)
    
    l = "{}three_prime_overlap".format(args.label_prefix)
    extracted_orfs.loc[m_three_prime_overlap, 'orf_type'] = l

    msg = "Found {} five_prime_overlap ORFs".format(len(leader_overlap_ids))
    logger.info(msg)

    msg = "Found {} three_prime_overlap ORFs".format(len(trailer_overlap_ids))
    logger.info(msg)

    msg = "Finding ORFs completely within 5' leaders"
    logger.info(msg)

    leader_matches = bed_utils.get_bed_overlaps(five_prime_exons, 
                                        extracted_orf_exons, min_b_overlap=1)
    leader_ids = {m.b_info for m in leader_matches}

    m_leader_matches = extracted_orf_exons['id'].isin(leader_ids)
    extracted_orf_exons = extracted_orf_exons[~m_leader_matches]

    m_five_prime = extracted_orfs['id'].isin(leader_ids)
    
    l = "{}five_prime".format(args.label_prefix)
    extracted_orfs.loc[m_five_prime, 'orf_type'] = l

    msg = "Found {} five_prime ORFs".format(len(leader_ids))
    logger.info(msg)

    msg = "Finding ORFs completely within 3' trailers"
    logger.info(msg)

    trailer_matches = bed_utils.get_bed_overlaps(three_prime_exons, 
                                        extracted_orf_exons, min_b_overlap=1)
    trailer_ids = {m.b_info for m in trailer_matches}

    m_trailer_matches = extracted_orf_exons['id'].isin(trailer_ids)
    extracted_orf_exons = extracted_orf_exons[~m_trailer_matches]

    m_three_prime = extracted_orfs['id'].isin(trailer_ids)
    
    l = "{}three_prime".format(args.label_prefix)
    extracted_orfs.loc[m_three_prime, 'orf_type'] = l

    msg = "Found {} three_prime ORFs".format(len(trailer_ids))
    logger.info(msg)

    msg = "Finding ORFs completely within annotated, noncoding transcripts"
    logger.info(msg)

    noncoding_matches = bed_utils.get_bed_overlaps(noncoding_exons, 
                                        extracted_orf_exons, min_b_overlap=1)

    noncoding_ids = {m.b_info for m in noncoding_matches}

    m_noncoding_matches = extracted_orf_exons['id'].isin(noncoding_ids)
    extracted_orf_exons = extracted_orf_exons[~m_noncoding_matches]

    m_noncoding = extracted_orfs['id'].isin(noncoding_ids)
    
    l = "{}noncoding".format(args.label_prefix)
    extracted_orfs.loc[m_noncoding, 'orf_type'] = l

    msg = "Found {} noncoding ORFs".format(len(noncoding_ids))
    logger.info(msg)

    # all of the remaining ORFs fall into the "suspect" category
    suspect_ids = {orf_id for orf_id in extracted_orf_exons['id']}

    m_suspect = extracted_orfs['id'].isin(suspect_ids)
    
    l = "{}suspect".format(args.label_prefix)
    extracted_orfs.loc[m_suspect, 'orf_type'] = l

    msg = "Found {} \"suspect\" ORFs".format(len(suspect_ids))
    logger.info(msg)

    m_no_orf_type = extracted_orfs['orf_type'].isnull()

    msg = "Found {} unlabeled ORFs".format(sum(m_no_orf_type))
    logger.info(msg)

    msg = "Writing ORFs with types to disk"
    logger.info(msg)

    fields = bed_utils.bed12_field_names + ['orf_num', 'orf_len', 'orf_type']
    extracted_orfs = extracted_orfs[fields]
    extracted_orfs = bed_utils.sort(extracted_orfs)

    bed_utils.write_bed(extracted_orfs, args.out)
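# A minimal sketch of the shared-stop-codon matching used above: the biological
# end of an ORF is 'end' on the forward strand and 'start' on the reverse
# strand, and two ORFs share a stop codon exactly when their
# (seqname, strand, orf_end) tuples agree. Toy coordinates only.
def _stop_codon_match_sketch():
    import pandas as pd

    canonical = pd.DataFrame({
        'id': ['can_1', 'can_2'],
        'seqname': ['chr1', 'chr1'],
        'strand': ['+', '-'],
        'start': [100, 500],
        'end': [400, 800],
    })
    extracted = pd.DataFrame({
        'id': ['orf_1', 'orf_2'],
        'seqname': ['chr1', 'chr1'],
        'strand': ['+', '-'],
        'start': [250, 500],
        'end': [400, 650],
    })

    # the true ORF end is 'end' on '+' strands and 'start' on '-' strands
    for df in (canonical, extracted):
        df['orf_end'] = df['end']
        m_reverse = df['strand'] == '-'
        df.loc[m_reverse, 'orf_end'] = df.loc[m_reverse, 'start']

    matches = canonical.merge(extracted, on=['seqname', 'strand', 'orf_end'],
                              suffixes=['_canonical', '_extracted'])

    # {('can_1', 'orf_1'), ('can_2', 'orf_2')}
    return set(zip(matches['id_canonical'], matches['id_extracted']))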
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script estimates the Bayes factors for all metagene profiles in the "
        "given file. The script accepts as input multiple \"periodic\" and \"nonperiodic\" "
        "models. It uses the models of each type with the best mean to estimate the Bayes "
        "factor distributions.\n\nIt contains some hard-coded field names.")

    parser.add_argument('metagene_profiles',
                        help="The (csv) file containing the metagene profiles")
    parser.add_argument('out', help="The output (csv.gz) file")

    parser.add_argument(
        '--periodic-models',
        help="A list of pickled StanModel files which contain "
        "models that somehow represent periodic metagene profiles",
        nargs="+",
        default=default_periodic_models)
    parser.add_argument(
        '--nonperiodic-models',
        help="A list of pickled StanModel files which contain "
        "models that somehow represent nonperiodic metagene profiles",
        nargs="+",
        default=default_nonperiodic_models)

    parser.add_argument(
        '--periodic-offset-start',
        help="The position, relative to the translation "
        "initiation site, to begin calculating periodicity Bayes factors (inclusive)",
        type=int,
        default=default_periodic_offset_start)
    parser.add_argument(
        '--periodic-offset-end',
        help="The position, relative to the translation "
        "initiation site, to stop calculating periodicity Bayes factors (inclusive)",
        type=int,
        default=default_periodic_offset_end)
    parser.add_argument(
        '--metagene-profile-length',
        help="The length of the profile to use in the "
        "models. metagene_profile_length + periodic_offset_end must be consistent with the length "
        "of the extracted metagene profile. The length must be divisible by three.",
        type=int,
        default=default_metagene_profile_length)

    parser.add_argument('-s',
                        '--seed',
                        help="The random seeds to use for inference",
                        type=int,
                        default=default_seed)
    parser.add_argument('-c',
                        '--chains',
                        help="The number of MCMC chains to use",
                        type=int,
                        default=default_chains)
    parser.add_argument('-i',
                        '--iterations',
                        help="The number of MCMC iterations to use for "
                        "each chain",
                        type=int,
                        default=default_iterations)

    parser.add_argument(
        '-p',
        '--num-cpus',
        help="The number of CPUs to use. Each read "
        "length will be processed in its own thread (so that is the maximum number of CPUs "
        "that is useful).",
        type=int,
        default=default_num_cpus)

    parser.add_argument('--type-field', default=default_type_field)
    parser.add_argument('--count-field', default=default_count_field)
    parser.add_argument('--position-field', default=default_position_field)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # we will parallelize based on the lengths. So we need to know which lengths
    # are present in the metagene profiles file
    metagene_profiles = pd.read_csv(args.metagene_profiles)
    lengths = list(metagene_profiles['length'].unique())

    length_str = ','.join(str(int(l)) for l in lengths)
    msg = "Estimating Bayes factors for lengths: {}".format(length_str)
    logger.info(msg)

    length_groups = metagene_profiles.groupby('length')

    with suppress_stdout_stderr():

        all_profile_estimates_df = parallel.apply_parallel_groups(
            length_groups,
            args.num_cpus,
            estimate_profile_bayes_factors,
            args,
            progress_bar=True)

    msg = "Combining estimates into one data frame"
    logger.info(msg)

    all_profile_estimates_df = utils.remove_nones(all_profile_estimates_df)
    all_profile_estimates_df = pd.concat(all_profile_estimates_df)

    pandas_utils.write_df(all_profile_estimates_df, args.out, index=False)
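# A toy sketch of the per-read-length processing pattern above: group the
# profiles by 'length', apply an estimator to each group, drop empty results,
# and concatenate. estimate_group is a placeholder standing in for
# estimate_profile_bayes_factors, which is not shown in this excerpt.
def _per_length_sketch():
    import pandas as pd

    profiles = pd.DataFrame({
        'length': [28, 28, 29, 29],
        'position': [-10, -9, -10, -9],
        'count': [5, 7, 2, 3],
    })

    def estimate_group(group):
        # placeholder "estimate": just report the total count for this length
        return pd.DataFrame({'length': [group['length'].iloc[0]],
                             'total': [group['count'].sum()]})

    results = [estimate_group(group) for _, group in profiles.groupby('length')]
    results = [r for r in results if r is not None]
    return pd.concat(results)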
Example #41
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script creates the plots which detail the basic characteristics "
        "of the ORF predictions from the Rp-Bp pipeline. It also creates and compiles (if "
        "possible) a latex report for them.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('out',
                        help="The base output directory for the latex report")

    parser.add_argument(
        '--show-unfiltered-orfs',
        help="If this flag is "
        "present, bar charts showing the distribution of the types of the "
        "unfiltered ORF set will be included",
        action='store_true')

    parser.add_argument(
        '--show-orf-periodicity',
        help="If this flag is "
        "present, bar charts showing the periodicity of each ORF type will be "
        "included in the report.",
        action='store_true')

    parser.add_argument('--uniprot',
                        help="The uniprot ORF lengths, if available",
                        default=default_uniprot)
    parser.add_argument('--uniprot-label',
                        help="The label to use for the uniprot ORFs in "
                        "the plot",
                        default=default_uniprot_label)

    parser.add_argument('--image-type',
                        help="The format of the image files. This must be "
                        "a format usable by matplotlib.",
                        default=default_image_type)

    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files will "
                        "be overwritten.",
                        action='store_true')

    parser.add_argument(
        '--note',
        help="If this option is given, it will be used in the "
        "filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=default_note)

    parser.add_argument(
        '--show-chisq',
        help="If this flag is given, then the "
        "results from Rp-chi will be included in the document; otherwise, they "
        "will not be created or shown.",
        action='store_true')

    parser.add_argument('-t',
                        '--tmp',
                        help="A location for temporary files",
                        default=default_tmp)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config))

    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    programs = [
        'create-orf-length-distribution-line-graph',
        'create-orf-types-bar-chart', 'visualize-orf-type-metagene-profiles'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = ['riboseq_data', 'riboseq_samples']
    utils.check_keys_exist(config, required_keys)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # by default, we will not include chisq
    chisq_values = [False]
    if args.show_chisq:
        chisq_values = [True, False]

    filtered_values = [True]
    if args.show_unfiltered_orfs:
        filtered_values = [True, False]

    grouped_values = [True, False]

    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)

    # first, create all of the figures
    create_all_figures(config, args)

    note_str = config.get('note', None)
    out_note_str = note_str

    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note

    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations',
                                        None)

    project_name = config.get("project_name", default_project_name)
    title = "Rp-Bp prediction analysis for {}".format(project_name)
    abstract = "This document shows the results of the Rp-Bp pipeline analysis."

    #tex_file = os.path.join(args.out, "prediction-report.tex")
    tex_file = filenames.get_rpbp_prediction_report(args.out, out_note_str)

    with open(tex_file, 'w') as out:

        latex.begin_document(out, title, abstract)

        latex.write(out, "\n")

        latex.clearpage(out)

        ### ORF type distributions
        title = "Predicted ORF type distributions"
        latex.section(out, title)

        # first, handle all of the regular datasets
        sample_names = sorted(config['riboseq_samples'].keys())

        # and check if we also have replicates
        replicate_names = []
        if 'riboseq_biological_replicates' in config:
            replicate_names = sorted(
                ribo_utils.get_riboseq_replicates(config).keys())

        strands = ["+", "-"]

        i = 0
        for sample_name in sample_names:

            try:
                lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                    config, sample_name, is_unique=is_unique)
            except FileNotFoundError:
                msg = (
                    "Could not parse out lengths and offsets for sample: {}. "
                    "Skipping".format(sample_name))
                logger.error(msg)
                continue

            caption = "ORF types: {}".format(sample_name)
            is_first = True

            # first, just dump all of the bar charts to the page
            it = itertools.product(grouped_values, chisq_values,
                                   filtered_values)

            for is_grouped, is_chisq, is_filtered in it:

                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                orf_types_bar_chart = filenames.get_orf_types_bar_chart(
                    config['riboseq_data'],
                    sample_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq,
                    is_filtered=is_filtered)

                msg = "Looking for image file: {}".format(orf_types_bar_chart)
                logger.debug(msg)

                if os.path.exists(orf_types_bar_chart):
                    if is_first or (i % 4 == 0):
                        latex.begin_figure(out)
                        is_first = False

                    i += 1
                    latex.write_graphics(out, orf_types_bar_chart, height=0.15)

                    if i % 6 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)

                else:
                    msg = "Could not find image: {}".format(
                        orf_types_bar_chart)
                    logger.warning(msg)

            if (i > 0) and (i % 6) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i % 6 != 0:
            latex.clearpage(out)

        # now, if the config file specifies replicates, create figures for those
        i = 0
        for replicate_name in replicate_names:
            lengths = None
            offsets = None

            caption = "ORF types: {}".format(replicate_name)

            it = itertools.product(grouped_values, chisq_values,
                                   filtered_values)

            is_first = True

            for is_grouped, is_chisq, is_filtered in it:

                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                orf_types_bar_chart = filenames.get_orf_types_bar_chart(
                    config['riboseq_data'],
                    replicate_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq,
                    is_filtered=is_filtered)

                msg = "Looking for image file: {}".format(orf_types_bar_chart)
                logger.debug(msg)

                if os.path.exists(orf_types_bar_chart):
                    if is_first or (i % 4 == 0):
                        latex.begin_figure(out)
                        is_first = False

                    i += 1
                    latex.write_graphics(out, orf_types_bar_chart, height=0.15)

                    if i % 6 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)

                else:
                    msg = "Could not find image: {}".format(
                        orf_types_bar_chart)
                    logger.warning(msg)

            if (i > 0) and (i % 6) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i % 6 != 0:
            latex.clearpage(out)

        ### ORF type length distributions
        title = "Predicted ORF type length distributions"
        latex.section(out, title)

        i = 0
        for sample_name in sample_names:

            try:
                lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                    config, sample_name, is_unique=is_unique)
            except FileNotFoundError:
                msg = (
                    "Could not parse out lengths and offsets for sample: {}. "
                    "Skipping".format(sample_name))
                logger.error(msg)
                continue

            caption = "ORF type length distributions: {}".format(sample_name)

            is_first = True
            it = itertools.product(grouped_values, chisq_values)

            for is_grouped, is_chisq in it:

                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                orf_length_line_graph = filenames.get_orf_length_distribution_line_graph(
                    config['riboseq_data'],
                    sample_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq)

                if os.path.exists(orf_length_line_graph):

                    if is_first or (i % 4 == 0):
                        latex.begin_figure(out)
                        is_first = False

                    i += 1
                    latex.write_graphics(out,
                                         orf_length_line_graph,
                                         height=0.15)

                    if i % 4 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)

                else:
                    msg = "Could not find image: {}".format(
                        orf_length_line_graph)
                    logger.debug(msg)

            if (i > 0) and (i % 4) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i % 4 != 0:
            latex.clearpage(out)

        # now, if the config file specifies replicates, create figures for those
        i = 0
        for replicate_name in replicate_names:
            lengths = None
            offsets = None

            caption = "ORF types: {}".format(replicate_name)

            is_first = True
            it = itertools.product(grouped_values, chisq_values)

            for is_grouped, is_chisq in it:

                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                orf_length_line_graph = filenames.get_orf_length_distribution_line_graph(
                    config['riboseq_data'],
                    replicate_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq)

                if os.path.exists(orf_length_line_graph):

                    if is_first or (i % 4 == 0):
                        latex.begin_figure(out)
                        is_first = False

                    i += 1
                    latex.write_graphics(out,
                                         orf_length_line_graph,
                                         height=0.15)

                    if i % 4 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)

                else:
                    msg = "Could not find image: {}".format(
                        orf_length_line_graph)
                    logger.debug(msg)

            if (i > 0) and (i % 4) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i % 4 != 0:
            latex.clearpage(out)

        ### ORF type metagene profiles
        if args.show_orf_periodicity:
            title = "Predicted ORF type metagene profiles"
            latex.section(out, title)

            i = 0
            for sample_name in sample_names:

                try:
                    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                        config, sample_name, is_unique=is_unique)
                except FileNotFoundError:
                    msg = (
                        "Could not parse out lengths and offsets for sample: {}. "
                        "Skipping".format(sample_name))
                    logger.error(msg)
                    continue

                caption = "ORF type metagene profiles: {}".format(sample_name)

                is_first = True

                for is_chisq in chisq_values:

                    if is_chisq:
                        f = None
                        rw = None
                    else:
                        f = fraction
                        rw = reweighting_iterations

                    orf_type_profile_base = filenames.get_orf_type_profile_base(
                        config['riboseq_data'],
                        sample_name,
                        length=lengths,
                        offset=offsets,
                        is_unique=is_unique,
                        note=out_note_str,
                        fraction=f,
                        reweighting_iterations=rw,
                        is_chisq=is_chisq)

                    it = itertools.product(ribo_utils.orf_types, strands)

                    for orf_type, strand in it:
                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base, orf_type, strand,
                            args.image_type)

                        msg = "Looking for image file: {}".format(
                            orf_type_profile)
                        logger.debug(msg)
                        if os.path.exists(orf_type_profile):

                            if is_first or (i % 4 == 0):
                                latex.begin_figure(out)
                                is_first = False

                            i += 1
                            latex.write_graphics(out,
                                                 orf_type_profile,
                                                 height=0.23)

                            if i % 4 == 0:
                                latex.write_caption(out, caption)
                                latex.end_figure(out)
                                latex.clearpage(out)

                        else:
                            msg = "Could not find image: {}".format(
                                orf_type_profile)
                            logger.warning(msg)

                if (i > 0) and (i % 4 != 0):
                    latex.write_caption(out, caption)
                    latex.end_figure(out)
                    #latex.clearpage(out)

            if i % 4 != 0:
                latex.clearpage(out)

            i = 0
            for replicate_name in replicate_names:
                lengths = None
                offsets = None

                caption = "ORF type metagene profiles: {}".format(
                    replicate_name)
                is_first = True
                for is_chisq in chisq_values:

                    if is_chisq:
                        f = None
                        rw = None
                    else:
                        f = fraction
                        rw = reweighting_iterations

                    orf_type_profile_base = filenames.get_orf_type_profile_base(
                        config['riboseq_data'],
                        replicate_name,
                        length=lengths,
                        offset=offsets,
                        is_unique=is_unique,
                        note=out_note_str,
                        fraction=f,
                        reweighting_iterations=rw,
                        is_chisq=is_chisq)

                    it = itertools.product(ribo_utils.orf_types, strands)

                    for orf_type, strand in it:

                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base, orf_type, strand,
                            args.image_type)

                        if os.path.exists(orf_type_profile):

                            if is_first or (i % 4 == 0):
                                latex.begin_figure(out)
                                is_first = False

                            i += 1
                            latex.write_graphics(out,
                                                 orf_type_profile,
                                                 height=0.23)

                            if i % 4 == 0:
                                latex.write_caption(out, caption)
                                latex.end_figure(out)
                                latex.clearpage(out)
                        else:
                            msg = "Could not find image: {}".format(
                                orf_type_profile)
                            logger.debug(msg)

                if (i > 0) and (i % 4 != 0):
                    latex.write_caption(out, caption)
                    latex.end_figure(out)
                    #latex.clearpage(out)

            if i % 4 != 0:
                latex.clearpage(out)

        latex.end_document(out)

    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)
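
# The report code above repeats the same "write up to four graphics, then close
# the figure" pattern for each image type. Below is a minimal, hedged sketch of
# how that pattern could be factored into a helper; `latex` refers to the same
# helper module used by the report script, and the function name
# `write_image_grid` is hypothetical, not part of the pipeline.
import os

def write_image_grid(out, image_files, caption, images_per_figure=4, height=0.23):
    """Write the existing images in groups, closing each figure after
    `images_per_figure` graphics (mirroring the `i % 4` logic above)."""
    i = 0
    for image_file in image_files:
        if not os.path.exists(image_file):
            continue

        if i % images_per_figure == 0:
            latex.begin_figure(out)

        i += 1
        latex.write_graphics(out, image_file, height=height)

        if i % images_per_figure == 0:
            latex.write_caption(out, caption)
            latex.end_figure(out)
            latex.clearpage(out)

    # close a partially-filled figure, if there is one
    if i % images_per_figure != 0:
        latex.write_caption(out, caption)
        latex.end_figure(out)
        latex.clearpage(out)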
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Collect the individual read length ORF profiles created "
        "by create-read-length-orf-profiles into a single sparse \"tensor\". "
        "N.B. This script is called by create-read-length-orf-profiles, and "
        "it is unlikely that it should ever be called independently.")
    
    parser.add_argument('config', help="The (json) config file")
    parser.add_argument('name', help="The name for the dataset, used in the "
        "created files")
    
    parser.add_argument('out', help="The (mtx.gz) output file containing the "
        "ORF profiles and read lengths")

    parser.add_argument('-c', '--is-condition', help="If this flag is present, "
        "then \"name\" will be taken to be a condition name. The profiles for "
        "all relevant replicates of the condition will be created.", 
        action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)
    
    msg = "Reading config file"
    logger.info(msg)
    config = yaml.load(open(args.config))

    # pull out what we need from the config file
    is_unique = not ('keep_riboseq_multimappers' in config)    
    seqname_prefix_str = utils.get_config_argument(config, 'seqname_prefix')
    note = config.get('note', None)

    names = [args.name]
    if args.is_condition:
        riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
        names = [n for n in riboseq_replicates[args.name]]

    # keep a map from the lengths to the combined profiles
    length_profile_map = {}

    for name in names:

        msg = "Processing sample: {}".format(name)
        logger.info(msg)
        
        # get the lengths and offsets which meet the required criteria from the config file
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config, 
            name, 
            is_unique=is_unique
        )

        if len(lengths) == 0:
            msg = ("No periodic read lengths and offsets were found. Try relaxing "
                "min_metagene_profile_count, min_metagene_bf_mean, "
                "max_metagene_bf_var, and/or min_metagene_bf_likelihood. Qutting.")
            logger.critical(msg)
            return

        for length, offset in zip(lengths, offsets):
                        
            mtx = filenames.get_riboseq_profiles(
                config['riboseq_data'], 
                name, 
                length=[length], 
                offset=[offset], 
                is_unique=is_unique, 
                note=note
            )

            mtx = scipy.io.mmread(mtx).tocsr()

            prior_mtx = length_profile_map.get(length, None)

            if prior_mtx is None:
                length_profile_map[length] = mtx
            else:
                length_profile_map[length] = prior_mtx + mtx

    with gzip.open(args.out, 'wb') as target_gz:

        for length, mtx in length_profile_map.items():
            mtx = mtx.tocoo()
                        
            msg = "Writing ORF profiles. length: {}.".format(length)
            logger.info(msg)
            
            for row, col, val in zip(mtx.row, mtx.col, mtx.data):
                s = "{} {} {} {}\n".format(length, row, col, val)
                target_gz.write(s.encode())
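
# A hedged sketch of reading the file written above back into a dictionary
# mapping each read length to a sparse profile matrix. The format is exactly
# what the loop above writes: one gzip-compressed "length row col value" record
# per line. The helper name, the use of scipy.sparse, and inferring the matrix
# shape from the largest observed indices are assumptions for illustration.
import collections
import gzip

import scipy.sparse

def read_length_orf_profiles(filename):
    entries = collections.defaultdict(list)
    with gzip.open(filename, 'rt') as f:
        for line in f:
            length, row, col, val = line.split()
            entries[int(length)].append((int(row), int(col), float(val)))

    length_profile_map = {}
    for length, records in entries.items():
        rows, cols, vals = zip(*records)
        # the shape is only a lower bound inferred from the observed entries
        shape = (max(rows) + 1, max(cols) + 1)
        length_profile_map[length] = scipy.sparse.coo_matrix(
            (vals, (rows, cols)), shape=shape).tocsr()

    return length_profile_map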
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="")  #filenames.run_riboseq_preprocessing_description)

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument(
        'name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of processors to use",
                        type=int,
                        default=default_num_cpus)

    parser.add_argument('--mem',
                        help="The amount of RAM to request",
                        default=default_mem)

    parser.add_argument(
        '--flexbar-format-option',
        help="The name of the \"format\" "
        "option for flexbar. This changed from \"format\" to \"qtrim-format\" in "
        "version 2.7.",
        default=default_flexbar_format_option)

    parser.add_argument('-t',
                        '--tmp',
                        help="The location for temporary files. If not "
                        "specified, program-specific temp locations are used.",
                        default=default_tmp)

    parser.add_argument('--do-not-call', action='store_true')
    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files "
                        "will be overwritten.",
                        action='store_true')

    parser.add_argument(
        '-k',
        '--keep-intermediate-files',
        help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.",
        action='store_true')

    star_utils.add_star_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[create-base-genome-profile]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    config = yaml.load(open(args.config))
    call = not args.do_not_call
    keep_delete_files = args.keep_intermediate_files or args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar', args.star_executable, 'samtools', 'bowtie2',
        'remove-multimapping-reads'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data', 'ribosomal_index', 'gtf', 'genome_base_path',
        'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)

    # Step 0: Running flexbar to remove adapter sequences

    raw_data = args.raw_data
    flexbar_target = filenames.get_without_adapters_base(
        config['riboseq_data'], args.name, note=note)
    without_adapters = filenames.get_without_adapters_fastq(
        config['riboseq_data'], args.name, note=note)

    adapter_seq_str = utils.get_config_argument(config, 'adapter_sequence',
                                                'adapter-seq')
    adapter_file_str = utils.get_config_argument(config, 'adapter_file',
                                                 'adapters')

    quality_format_str = utils.get_config_argument(
        config,
        'quality_format',
        args.flexbar_format_option,
        default=default_quality_format)
    max_uncalled_str = utils.get_config_argument(config,
                                                 'max_uncalled',
                                                 default=default_max_uncalled)
    pre_trim_left_str = utils.get_config_argument(
        config, 'pre_trim_left', default=default_pre_trim_left)

    cmd = "flexbar {} {} {} {} -n {} {} -r {} -t {} {}".format(
        quality_format_str, max_uncalled_str, adapter_seq_str,
        adapter_file_str, args.num_cpus, flexbar_compression_str, raw_data,
        flexbar_target, pre_trim_left_str)
    in_files = [raw_data]
    out_files = [without_adapters]
    file_checkers = {without_adapters: fastx_utils.check_fastq_file}
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite,
                                   call=call)

    # Step 1: Running bowtie2 to remove rRNA alignments
    out = utils.abspath("dev", "null")  # we do not care about the alignments
    without_rrna = filenames.get_without_rrna_fastq(config['riboseq_data'],
                                                    args.name,
                                                    note=note)
    with_rrna = filenames.get_with_rrna_fastq(config['riboseq_data'],
                                              args.name,
                                              note=note)

    cmd = "bowtie2 -p {} --very-fast -x {} -U {} -S {} --un-gz {} --al-gz {}".format(
        args.num_cpus, config['ribosomal_index'], without_adapters, out,
        without_rrna, with_rrna)
    in_files = [without_adapters]
    in_files.extend(bio.get_bowtie2_index_files(config['ribosomal_index']))
    out_files = [without_rrna, with_rrna]
    to_delete = [without_adapters]
    file_checkers = {without_rrna: fastx_utils.check_fastq_file}
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite,
                                   call=call,
                                   keep_delete_files=keep_delete_files,
                                   to_delete=to_delete)

    # Step 2: Running STAR to align rRNA-depleted reads to genome
    star_output_prefix = filenames.get_riboseq_bam_base(config['riboseq_data'],
                                                        args.name,
                                                        note=note)
    #transcriptome_bam = "{}{}".format(star_output_prefix, "Aligned.toTranscriptome.out.bam")
    genome_star_bam = "{}{}".format(star_output_prefix,
                                    "Aligned.sortedByCoord.out.bam")

    star_compression_str = "--readFilesCommand {}".format(
        shlex.quote(args.star_read_files_command))

    align_intron_min_str = utils.get_config_argument(
        config,
        'align_intron_min',
        'alignIntronMin',
        default=default_align_intron_min)
    align_intron_max_str = utils.get_config_argument(
        config,
        'align_intron_max',
        'alignIntronMax',
        default=default_align_intron_max)
    out_filter_mismatch_n_max_str = utils.get_config_argument(
        config,
        'out_filter_mismatch_n_max',
        'outFilterMismatchNmax',
        default=default_out_filter_mismatch_n_max)
    out_filter_mismatch_n_over_l_max_str = utils.get_config_argument(
        config,
        'out_filter_mismatch_n_over_l_max',
        'outFilterMismatchNoverLmax',
        default=default_out_filter_mismatch_n_over_l_max)
    out_filter_type_str = utils.get_config_argument(
        config,
        'out_filter_type',
        'outFilterType',
        default=default_out_filter_type)
    out_filter_intron_motifs_str = utils.get_config_argument(
        config,
        'out_filter_intron_motifs',
        'outFilterIntronMotifs',
        default=default_out_filter_intron_motifs)
    out_sam_attributes_str = utils.get_config_argument(
        config,
        'out_sam_attributes',
        'outSAMattributes',
        default=default_out_sam_attributes)

    star_tmp_str = ""
    if args.tmp is not None:
        star_tmp_name = "STAR_rpbp"
        star_tmp_dir = star_utils.create_star_tmp(args.tmp, star_tmp_name)
        star_tmp_str = "--outTmpDir {}".format(star_tmp_dir)

    mem_bytes = utils.human2bytes(args.mem)
    star_mem_str = "--limitBAMsortRAM {}".format(mem_bytes)

    cmd = (
        "{} --runThreadN {} {} --genomeDir {} --sjdbGTFfile {} --readFilesIn {} "
        "{} {} {} {} {} {} {} {} --outFileNamePrefix {} {} {} {}".format(
            args.star_executable, args.num_cpus, star_compression_str,
            config['star_index'], config['gtf'], without_rrna,
            align_intron_min_str, align_intron_max_str,
            out_filter_mismatch_n_max_str, out_filter_type_str,
            out_filter_intron_motifs_str, quant_mode_str,
            out_filter_mismatch_n_over_l_max_str, out_sam_attributes_str,
            star_output_prefix, star_out_str, star_tmp_str, star_mem_str))
    in_files = [without_rrna]
    in_files.extend(star_utils.get_star_index_files(config['star_index']))
    #out_files = [transcriptome_bam, genome_star_bam]
    to_delete = [without_rrna]
    out_files = [genome_star_bam]
    file_checkers = {
        #transcriptome_bam: bam_utils.check_bam_file,
        genome_star_bam: bam_utils.check_bam_file
    }
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite,
                                   call=call,
                                   keep_delete_files=keep_delete_files,
                                   to_delete=to_delete)

    # now, we need to symlink the (genome) STAR output to that expected by the rest of the pipeline
    genome_sorted_bam = filenames.get_riboseq_bam(config['riboseq_data'],
                                                  args.name,
                                                  note=note)

    if os.path.exists(genome_star_bam):
        utils.create_symlink(genome_star_bam, genome_sorted_bam, call)
    else:
        msg = ("Could not find the STAR genome bam alignment file. Unless "
               "--do-not-call was given, this is a problem.")
        logger.warning(msg)

    # index the genome-sorted bam with samtools
    cmd = "samtools index -b {}".format(genome_sorted_bam)
    shell_utils.check_call(cmd, call=call)

    # check if we want to keep multimappers
    if 'keep_riboseq_multimappers' in config:
        return

    # remove multimapping reads from the genome file
    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    unique_genome_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                       args.name,
                                                       is_unique=True,
                                                       note=note)

    cmd = "remove-multimapping-reads {} {} {}".format(genome_sorted_bam,
                                                      unique_genome_filename,
                                                      tmp_str)

    in_files = [genome_sorted_bam]
    out_files = [unique_genome_filename]
    to_delete = [genome_star_bam, genome_sorted_bam]
    file_checkers = {unique_genome_filename: bam_utils.check_bam_file}
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite,
                                   call=call,
                                   keep_delete_files=keep_delete_files,
                                   to_delete=to_delete)
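
# Each preprocessing step above is wrapped in shell_utils.call_if_not_exists.
# Below is a minimal, hedged sketch of that guard pattern (not the actual
# implementation): run an external command only when its outputs are missing or
# overwriting is requested, validate the outputs with the supplied checker
# functions, and optionally delete intermediate files afterwards.
import os
import shlex
import subprocess

def call_if_not_exists_sketch(cmd, out_files, in_files=None, file_checkers=None,
                              overwrite=False, call=True, keep_delete_files=False,
                              to_delete=None):
    in_files = in_files or []
    file_checkers = file_checkers or {}
    to_delete = to_delete or []

    missing = [f for f in in_files if not os.path.exists(f)]
    if missing:
        raise FileNotFoundError("Missing input files: {}".format(missing))

    all_outputs_exist = all(os.path.exists(f) for f in out_files)

    if call and (overwrite or not all_outputs_exist):
        subprocess.run(shlex.split(cmd), check=True)

        # validate the outputs with the provided checker functions
        for filename, checker in file_checkers.items():
            if not checker(filename):
                raise RuntimeError("Output failed validation: {}".format(filename))

    # clean up intermediate files unless asked to keep them
    if call and not keep_delete_files:
        for filename in to_delete:
            if os.path.exists(filename):
                os.remove(filename)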
def main():

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script extracts the metagene profile from reads in a BAM "
        "file, possibly filtering by length. It attempts to vectorize as many of the "
        "counting operations as possible.")
    parser.add_argument('bam', help="The bam file")
    parser.add_argument('orfs', help="The annotated transcripts (bed) file")
    parser.add_argument('out', help="The (output) csv.gz counts file")

    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
        type=int, default=default_num_cpus)

    parser.add_argument('--is-sam', help="If this flag is present, the alignment file will "
        "be parsed as SAM rather than BAM", action='store_true')

    parser.add_argument('--lengths', help="If specified, then metagene profiles will be "
        "created for reads of each length. Otherwise, profiles will be created for each "
        "read length present in the bam file.", type=int, nargs='*', default=default_lengths)

    parser.add_argument('--start-upstream', type=int, default=default_start_upstream, 
        help="The number of bases upstream of the translation initiation site to begin "
        "constructing the metagene profile.")
    parser.add_argument('--start-downstream', type=int, default=default_start_downstream,
        help="The number of bases downstream of the translation initiation site to end "
        "the metagene profile.")
    parser.add_argument('--end-upstream', type=int, default=default_end_upstream,
        help="The number of bases upstream of the translation termination site to begin "
        "constructing the metagene profile.")
    parser.add_argument('--end-downstream', type=int, default=default_end_downstream,
        help="The number of bases downstream of the translation termination site to end "
        "the metagene profile.")
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[extract-metagene-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    # first, get the 5' ends of the reads
    alignment_df = bam_utils.get_five_prime_ends(args.bam, progress_bar=True, 
        count=True, logger=logger)

    msg = "Reading annotations"
    logger.info(msg)
    annotations_df = bed_utils.read_bed(args.orfs)

    msg = "Constructing canonical translation initiation ORF data frames"
    logger.info(msg)

    m_has_canonical = annotations_df['thick_start'] > -1
    m_forward = annotations_df['strand'] == '+'

    m_canonical_forward = m_has_canonical & m_forward
    m_canonical_reverse = m_has_canonical & ~m_forward

    # forward translation initiation
    start = annotations_df.loc[m_canonical_forward, 'thick_start'] - args.start_upstream
    end = annotations_df.loc[m_canonical_forward, 'thick_start'] + args.start_downstream
    seqname = annotations_df.loc[m_canonical_forward, 'seqname']
    strand = '+'

    intervals_forward_initiation_bed = get_interval_df(start, end, seqname, strand)

    # reverse translation initiation
    start = annotations_df.loc[m_canonical_reverse, 'thick_end'] - args.start_downstream
    end = annotations_df.loc[m_canonical_reverse, 'thick_end'] + args.start_upstream
    seqname = annotations_df.loc[m_canonical_reverse, 'seqname']
    strand = '-'

    intervals_reverse_initiation_bed = get_interval_df(start, end, seqname, strand)

    # all translation initiation regions
    intervals_initiation_bed = pd.concat([intervals_forward_initiation_bed, intervals_reverse_initiation_bed])

    # make sure we do not double count isoforms with the same starts
    intervals_initiation_bed = intervals_initiation_bed.drop_duplicates()

    msg = "Constructing canonical translation termination ORF data frames"
    logger.info(msg)
    
    # forward translation termination
    start = annotations_df.loc[m_canonical_forward, 'thick_end'] - args.end_upstream
    end = annotations_df.loc[m_canonical_forward, 'thick_end'] + args.end_downstream
    seqname = annotations_df.loc[m_canonical_forward, 'seqname']
    strand = '+'
    
    intervals_forward_termination_bed = get_interval_df(start, end, seqname, strand)

    # reverse translation termination
    start = annotations_df.loc[m_canonical_reverse, 'thick_start'] - args.end_downstream
    end = annotations_df.loc[m_canonical_reverse, 'thick_start'] + args.end_upstream
    seqname = annotations_df.loc[m_canonical_reverse, 'seqname']
    strand = '-'
    intervals_reverse_termination_bed = get_interval_df(start, end, seqname, strand)

    # all translation termination regions
    intervals_termination_bed = pd.concat([intervals_forward_termination_bed, intervals_reverse_termination_bed])

    # make sure we do not double count isoforms with the same starts
    intervals_termination_bed = intervals_termination_bed.drop_duplicates()

    msg = "Finding translation initiation site matches"
    logger.info(msg)

    initiation_matches = bed_utils.get_all_position_intersections(alignment_df, intervals_initiation_bed)
    profile_length = args.start_upstream + args.start_downstream + 1
    initiation_length_strand_profiles = get_length_strand_profiles(initiation_matches, profile_length)

    initiation_keys_str = ','.join(str(k) for k in initiation_length_strand_profiles.keys())
    msg = "Initiation keys: {}".format(initiation_keys_str)
    logger.debug(msg)

    msg = "Finding translation termination site matches"
    logger.info(msg)

    termination_matches = bed_utils.get_all_position_intersections(alignment_df, intervals_termination_bed)
    profile_length = args.end_upstream + args.end_downstream + 1
    termination_length_strand_profiles = get_length_strand_profiles(termination_matches, profile_length)

    termination_keys_str = ','.join(str(k) for k in termination_length_strand_profiles.keys())
    msg = "Termination keys: {}".format(termination_keys_str)
    logger.debug(msg)

    msg = "Extracting metagene profiles"
    logger.info(msg)

    
    if len(args.lengths) == 0:
        args.lengths = list(alignment_df['length'].unique())

    args.lengths = np.sort(args.lengths)
    args.lengths = [int(l) for l in args.lengths]
    length_str = ','.join(str(l) for l in args.lengths)
    msg = "Profiles will be created for lengths: {}".format(length_str)
    logger.info(msg)

    all_metagene_profile_dfs = []

    for length in args.lengths:
        # first, the profile for this length around initiation sites
        initiation_profile_df = get_metagene_profile_df(length, 
                                                        'start',
                                                        initiation_length_strand_profiles, 
                                                        args.start_upstream, 
                                                        args.start_downstream)
        
        all_metagene_profile_dfs.append(initiation_profile_df)
        
        # and around termination sites
        termination_profile_df = get_metagene_profile_df(length, 
                                                        'end',
                                                        termination_length_strand_profiles, 
                                                        args.end_upstream, 
                                                        args.end_downstream)
        
        all_metagene_profile_dfs.append(termination_profile_df)
        
    # filter out all of the profiles which did not have any reads
    all_metagene_profile_dfs = [df for df in all_metagene_profile_dfs if df is not None]

    # join them together in one large data frame
    all_metagene_profile_dfs = pd.concat(all_metagene_profile_dfs)

    msg = "Writing metagene profiles to disk"
    logger.info(msg)
    pandas_utils.write_df(all_metagene_profile_dfs, args.out, index=False)
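
# get_interval_df is defined elsewhere in this script's module; a hedged sketch
# of what it plausibly does is shown below. It simply assembles the start, end,
# seqname and strand values computed above into a bed-like data frame that
# bed_utils.get_all_position_intersections can consume. The exact column set
# (including the placeholder 'id' and 'score' fields) is an assumption.
import pandas as pd

def get_interval_df(start, end, seqname, strand):
    interval_df = pd.DataFrame()
    interval_df['start'] = start
    interval_df['end'] = end
    interval_df['seqname'] = seqname
    interval_df['strand'] = strand
    interval_df['id'] = '.'
    interval_df['score'] = 0

    return interval_df[['seqname', 'start', 'end', 'id', 'score', 'strand']]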
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script extracts the differential micropeptides from two "
        "conditions. Please see the documentation in redmine for more details.\n\n"
        "Please see the pyensembl (https://github.com/hammerlab/pyensembl) "
        "documentation for more information about the ensembl release and species.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name_a', help="The name of the first condition")
    parser.add_argument('name_b', help="The name of the second condition")
    parser.add_argument('out', help="The output (.csv.gz or .xlsx) file")

    parser.add_argument('-a', '--append-sheet', help="If this flag is given, "
        "then a worksheet with the name '<name_a>,<name_b>' will be appended "
        "to the .xlsx file given by out (if it exists)", action='store_true')

    parser.add_argument('-f', '--filter', help="If this flag is present, then "
        "the output will be filtered to include only the differential "
        "micropeptides with the highest KL-divergence and read coverage",
        action='store_true')

    parser.add_argument('--read-filter-percent', help="If the --filter flag "
        "is given, then only the top --read-filter-percent micropeptides will "
        "be considered for the final output. They still must meet the KL-"
        "divergence filtering criteria.", type=float, 
        default=default_read_filter_percent)

        
    parser.add_argument('--kl-filter-percent', help="If the --filter flag "
        "is given, then only the top --kl-filter-percent micropeptides will "
        "be considered for the final output. They still must meet the read "
        "coverage filtering criteria.", type=float, 
        default=default_kl_filter_percent)

    parser.add_argument('--id-matches', help="This is a list of files which "
        "contain ORF identifiers to compare to the differential micropeptides. "
        "For each of the files given, two columns will be added to the output "
        "which indicate if either A or B appear in the respective file. Each "
        "file should have a single ORF identifier on each line and contain "
        "nothing else.", nargs='*', default=default_id_matches)

    parser.add_argument('--id-match-names', help="A name to include in the "
        "output file for each --id-matches file. The number of names must "
        "match the number of files.", nargs='*', default=default_id_match_names)

    parser.add_argument('--overlaps', help="This is a list of bed12+ files "
        "which will be compared to the differential micropeptides. Two columns "
        "(one for A, one for B) will be added to the output which indicate if "
        "the respective micropeptides overlap a feature in each file by at "
        "least 1 bp.", nargs='*', default=default_overlaps)

    parser.add_argument('--overlap-names', help="A name to include in the "
        "output file for each --overlaps file. The number of names must match "
        "the number of files.", nargs='*', default=default_overlap_names)

    parser.add_argument('-r', '--ensembl-release', help="The version of Ensembl "
        "to use when mapping transcript identifiers to gene identifiers", 
        type=int, default=default_ensembl_release)

    parser.add_argument('-s', '--ensembl-species', help="The Ensembl species "
        "to use when mapping transcript identifiers to gene identifiers", 
        default=default_ensembl_species)

    parser.add_argument('--a-is-single-sample', help="By default, this script "
        "assumes the predictions come from merged replicates. If name_a is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.", action='store_true')

    parser.add_argument('--b-is-single-sample', help="By default, this script "
        "assumes the predictions come from merged replicates. If name_b is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.", action='store_true')

    parser.add_argument('--fields-to-keep', help="The fields to keep from the "
        "Bayes factor file for each condition", nargs='*', 
        default=default_fields_to_keep)

    parser.add_argument('--max-micropeptide-len', help="The maximum (inclusive) "
        "length of ORFs considered as micropeptides", type=int, 
        default=default_max_micropeptide_len)

    parser.add_argument('--do-not-fix-tcons', help="By default, the \"TCONS_\" "
        "identifiers from StringTie, etc., do not parse correctly; this script "
        "update the identifiers so that will parse correctly unless instructed not "
        "to. The script is likely to crash if the identifiers are not fixed.",
        action='store_true')
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ensembl database"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release, 
        species=args.ensembl_species)
    # touch the annotation database so any problems with the Ensembl release
    # surface here, before the rest of the script runs
    ensembl.db

    msg = "Checking the id-match and overlaps files"
    logger.info(msg)

    if len(args.id_matches) != len(args.id_match_names):
        msg = ("The number of --id-matches files and --id-match-names do not "
            "match. {} files and {} names".format(len(args.id_matches), 
            len(args.id_match_names)))

        raise ValueError(msg)

    if len(args.overlaps) != len(args.overlap_names):
        msg = ("The number of --overlaps files and --overlaps-names do not "
            "match. {} files and {} names".format(len(args.overlaps), 
            len(args.overlap_names)))

        raise ValueError(msg)

    utils.check_files_exist(args.id_matches)
    utils.check_files_exist(args.overlaps)

    if args.filter:
        msg = "Validating filter percentages"
        logger.info(msg)

        math_utils.check_range(args.read_filter_percent, 0, 1, 
            variable_name="--read-filter-percent")
            
        math_utils.check_range(args.kl_filter_percent, 0, 1, 
            variable_name="--kl-filter-percent")

    msg = "Extracting file names"
    logger.info(msg)

    config = yaml.load(open(args.config))

    note_str = config.get('note', None)

    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    # and the smoothing parameters
    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations', None)

    lengths_a = None
    offsets_a = None

    if args.a_is_single_sample:
        lengths_a, offsets_a = ribo_utils.get_periodic_lengths_and_offsets(config, 
                args.name_a, is_unique=is_unique)

    bayes_factors_a = filenames.get_riboseq_bayes_factors(config['riboseq_data'], args.name_a, 
        length=lengths_a, offset=offsets_a, is_unique=is_unique, note=note_str, 
        fraction=fraction, reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors_a):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".
            format(args.name_a, bayes_factors_a))
        raise FileNotFoundError(msg)

    predicted_orfs_a = filenames.get_riboseq_predicted_orfs(config['riboseq_data'], 
        args.name_a, length=lengths_a, offset=offsets_a, is_unique=is_unique, note=note_str, 
        fraction=fraction, reweighting_iterations=reweighting_iterations,
        is_filtered=True, is_chisq=False)
    
    if not os.path.exists(predicted_orfs_a):
        msg = ("Could not find the predictions bed file for {}. ({}). Quitting.".
            format(args.name_a, predicted_orfs_a))
        raise FileNotFoundError(msg)


    lengths_b = None
    offsets_b = None
    if args.b_is_single_sample:
        lengths_b, offsets_b = ribo_utils.get_periodic_lengths_and_offsets(config, 
                args.name_b, is_unique=is_unique)
        
    bayes_factors_b = filenames.get_riboseq_bayes_factors(config['riboseq_data'], args.name_b, 
        length=lengths_b, offset=offsets_b, is_unique=is_unique, note=note_str, 
        fraction=fraction, reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors_b):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".
            format(args.name_b, bayes_factors_b))
        raise FileNotFoundError(msg)

    predicted_orfs_b = filenames.get_riboseq_predicted_orfs(config['riboseq_data'], 
        args.name_b, length=lengths_b, offset=offsets_b, is_unique=is_unique, note=note_str, 
        fraction=fraction, reweighting_iterations=reweighting_iterations,
        is_filtered=True, is_chisq=False)
    
    if not os.path.exists(predicted_orfs_b):
        msg = ("Could not find the predictions bed file for {}. ({}). Quitting.".
            format(args.name_b, predicted_orfs_b))
        raise FileNotFoundError(msg)

    exons_file = filenames.get_exons(config['genome_base_path'], config['genome_name'],
        note=config.get('orf_note'), is_orf=True)

    if not os.path.exists(exons_file):
        msg = "Could not find the exons file ({}). Quitting.".format(exons_file)
        raise FileNotFoundError(msg)

    msg = "Reading the exons"
    logger.info(msg)

    exons = bed_utils.read_bed(exons_file)

    msg = "Reading the BF files"
    logger.info(msg)

    bf_df_a = bed_utils.read_bed(bayes_factors_a)
    bf_df_b = bed_utils.read_bed(bayes_factors_b)

    msg = "Reading the predictions files"
    logger.info(msg)

    bed_df_a = bed_utils.read_bed(predicted_orfs_a)
    bed_df_b = bed_utils.read_bed(predicted_orfs_b)

    differential_micropeptide_dfs = []

    # extract micropeptides
    msg = "Extracting micropeptides"
    logger.info(msg)

    m_micropeptides_a = bed_df_a['orf_len'] <= args.max_micropeptide_len
    m_micropeptides_b = bed_df_b['orf_len'] <= args.max_micropeptide_len

    micropeptides_a = bed_df_a[m_micropeptides_a]
    micropeptides_b = bed_df_b[m_micropeptides_b]

    long_orfs_a = bed_df_a[~m_micropeptides_a]
    long_orfs_b = bed_df_b[~m_micropeptides_b]

    msg = "Finding micropeptides in A with no overlap in B"
    logger.info(msg)

    micropeptides_a_no_match_b = bed_utils.subtract_bed(micropeptides_a, bed_df_b, exons=exons)

    micropeptides_a_no_match_b_df = pd.DataFrame()
    micropeptides_a_no_match_b_df['A'] = list(micropeptides_a_no_match_b)
    micropeptides_a_no_match_b_df['B'] = None
    micropeptides_a_no_match_b_df['kl'] = np.inf
    micropeptides_a_no_match_b_df['overlap_type'] = 'micro_a_only'

    differential_micropeptide_dfs.append(micropeptides_a_no_match_b_df)

    msg = "Finding micropeptides in B with no overlap in A"
    logger.info(msg)

    micropeptides_b_no_match_a = bed_utils.subtract_bed(micropeptides_b, bed_df_a, exons=exons)

    micropeptides_b_no_match_a_df = pd.DataFrame()
    micropeptides_b_no_match_a_df['B'] = list(micropeptides_b_no_match_a)
    micropeptides_b_no_match_a_df['A'] = None
    micropeptides_b_no_match_a_df['kl'] = np.inf
    micropeptides_b_no_match_a_df['overlap_type'] = 'micro_b_only'

    differential_micropeptide_dfs.append(micropeptides_b_no_match_a_df)

    msg = "Finding overlapping micropeptides"
    logger.info(msg)

    micropeptides_a_micropeptides_b_df = get_overlap_df(micropeptides_a, 
        micropeptides_b, 'micro_a_micro_b', bf_df_a, bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_micropeptides_b_df)

    micropeptides_a_long_b_df = get_overlap_df(micropeptides_a, long_orfs_b, 
        'micro_a_long_b', bf_df_a, bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_long_b_df)


    micropeptides_b_long_a_df = get_overlap_df(long_orfs_a, micropeptides_b, 
        'long_a_micro_b', bf_df_a, bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_b_long_a_df)

    differential_micropeptides_df = pd.concat(differential_micropeptide_dfs)

    msg = "Adding read count information"
    logger.info(msg)

    res = differential_micropeptides_df.merge(bf_df_a[args.fields_to_keep], 
        left_on='A', right_on='id', how='left')
    to_rename = {f: "{}_A".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_A', axis=1)

    res = res.merge(bf_df_b[args.fields_to_keep], left_on='B', right_on='id', how='left')
    to_rename = {f: "{}_B".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_B', axis=1)

    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    if not args.do_not_fix_tcons:
        # replace TCONS_ with TCONS
        res['A'] = res['A'].str.replace("TCONS_", "TCONS")
        res['B'] = res['B'].str.replace("TCONS_", "TCONS")

    msg = "Extracting the genes and their biotypes using pyensembl"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release, 
        species=args.ensembl_species)
    ensembl_transcript_ids = set(ensembl.transcript_ids())

    biotypes_a = parallel.apply_df_simple(res, get_transcript_and_biotype, 
        'A', ensembl, ensembl_transcript_ids)
    biotypes_b = parallel.apply_df_simple(res, get_transcript_and_biotype, 
        'B', ensembl, ensembl_transcript_ids)

    biotypes_a = utils.remove_nones(biotypes_a)
    biotypes_b = utils.remove_nones(biotypes_b)

    biotypes_a = pd.DataFrame(biotypes_a)
    biotypes_b = pd.DataFrame(biotypes_b)

    res = res.merge(biotypes_a, on='A', how='left')
    res = res.merge(biotypes_b, on='B', how='left')

    msg = "Pulling annotations from mygene.info"
    logger.info(msg)

    # pull annotations from mygene
    gene_info_a = mygene_utils.query_mygene(res['gene_id_A'])
    gene_info_b = mygene_utils.query_mygene(res['gene_id_B'])

    # and add the mygene info
    res = res.merge(gene_info_a, left_on='gene_id_A', right_on='gene_id', 
        how='left')

    to_rename = {f: "{}_A".format(f) for f in gene_info_a.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)

    res = res.merge(gene_info_b, left_on='gene_id_B', 
        right_on='gene_id', how='left')

    to_rename = {f: "{}_B".format(f) for f in gene_info_a.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)
        
    msg = "Removing duplicates"
    logger.info(msg)
    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    msg = "Adding --id-matches columns"
    logger.info(msg)

    for (id_match_file, name) in zip(args.id_matches, args.id_match_names):
        res = add_id_matches(res, id_match_file, name)

    msg = "Adding --overlaps columns"
    logger.info(msg)

    for (overlap_file, name) in zip(args.overlaps, args.overlap_names):
        res = add_overlaps(res, overlap_file, name, bed_df_a, bed_df_b, exons)

    msg = "Sorting by in-frame reads"
    logger.info(msg)

    res['x_1_sum_A'] = res['x_1_sum_A'].fillna(0)
    res['x_1_sum_B'] = res['x_1_sum_B'].fillna(0)
    res['x_1_sum'] = res['x_1_sum_A'] + res['x_1_sum_B']
    res = res.sort_values('x_1_sum', ascending=False)

    if args.filter:
        msg = "Filtering the micropeptides by read coverage and KL-divergence"
        logger.info(msg)

        x_1_sum_ranks = res['x_1_sum'].rank(method='min', na_option='top', 
            ascending=False)
        num_x_1_sum_ranks = x_1_sum_ranks.max()
        max_good_x_1_sum_rank = num_x_1_sum_ranks * args.read_filter_percent
        m_good_x_1_sum_rank = x_1_sum_ranks <= max_good_x_1_sum_rank

        msg = ("Number of micropeptides passing read filter: {}".format(
            sum(m_good_x_1_sum_rank)))
        logger.debug(msg)

        kl_ranks = res['kl'].rank(method='dense', na_option='top', ascending=False)
        num_kl_ranks = kl_ranks.max()
        max_good_kl_rank = num_kl_ranks * args.kl_filter_percent
        m_good_kl_rank = kl_ranks <= max_good_kl_rank

        msg = ("Number of micropeptides passing KL filter: {}".format(
            sum(m_good_kl_rank)))
        logger.debug(msg)

        m_both_filters = m_good_x_1_sum_rank & m_good_kl_rank
        
        msg = ("Number of micropeptides passing both filters: {}".format(
            sum(m_both_filters)))
        logger.debug(msg)

        res = res[m_both_filters]


    msg = "Writing differential micropeptides to disk"
    logger.info(msg)

    if not args.append_sheet:
        pandas_utils.write_df(res, args.out, index=False)
    else:
        sheet_name = "{},{}".format(args.name_a, args.name_b)
        utils.append_to_xlsx(res, args.out, sheet=sheet_name, index=False)
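
# add_id_matches and add_overlaps are defined elsewhere; a hedged sketch of
# add_id_matches, consistent with the --id-matches help text above, is given
# below. It reads one ORF identifier per line from the match file and adds two
# boolean columns indicating whether the 'A' or 'B' ORF of each row appears in
# that file. The column-naming scheme is an assumption for illustration.
def add_id_matches(res, id_match_file, name):
    with open(id_match_file) as f:
        match_ids = {line.strip() for line in f if line.strip()}

    res['A_{}'.format(name)] = res['A'].isin(match_ids)
    res['B_{}'.format(name)] = res['B'].isin(match_ids)

    return res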
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script labels the ORFs found with extract-orf-coordinates "
        "based on their exon structure and relation to the annotated, canonical "
        "ORFs. It requires the exon blocks for the ORFs (created with "
        "split-bed12-blocks). It completely reads in the ORFs, so unless otherwise "
        "desired for some reason, the input and output files can be the same.")

    parser.add_argument('annotated_transcripts',
                        help="The annotated transcripts "
                        "for the genome, in bed12+ format")
    parser.add_argument('extracted_orfs',
                        help="The ORFs extracted from the "
                        "transcripts, in bed12+ format")
    parser.add_argument('orf_exons',
                        help="The exon blocks for the ORFs, in "
                        "bed6+ format")

    parser.add_argument('out', help="The output (bed12+.gz) file")

    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of CPUs to use for "
                        "a few parts of the script",
                        type=int,
                        default=default_num_cpus)

    parser.add_argument(
        '-f',
        '--filter',
        help="If this flag is given, then ORFs "
        "which are completely covered by an annotated transcript are discarded. "
        "Presumably, this is used to filter uninteresting ORFs from de novo "
        "assemblies.",
        action='store_true')

    parser.add_argument(
        '-e',
        '--annotated-exons',
        help="If the --filter flag is "
        "given, the annotated transcript exons can optionally be provided with "
        "this option. If they are not given, they will be split from the annotated "
        "transcripts. That is generally not a very expensive operation relative to "
        "everything else in the labeling script. If --filter is not given, then "
        "these are ignored.",
        default=default_annotated_exons)

    parser.add_argument(
        '-n',
        '--nonoverlapping-label',
        help="If this option is "
        "given, then ORFs which do not overlap the annotated transcripts at all "
        "will be given this label. Otherwise, they will be labeled as \"suspect\"",
        default=default_nonoverlapping_label)

    parser.add_argument(
        '-l',
        '--label-prefix',
        help="This string is prepended "
        "to all labels assigned to ORFs. For example, it is a useful way to "
        "indicate ORFs from de novo assemblies are \"novel.\" In any case, this "
        "*is not* prepended to \"canonical\" ORFs.",
        default=default_label_prefix)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading annotated transcripts"
    logger.info(msg)
    annotated_transcripts = bed_utils.read_bed(args.annotated_transcripts)

    msg = "Reading extracted ORFs and exons"
    logger.info(msg)
    extracted_orfs = bed_utils.read_bed(args.extracted_orfs)
    extracted_orf_exons = bed_utils.read_bed(args.orf_exons)

    msg = "Found {} extracted ORFs with {} exons".format(
        len(extracted_orfs), len(extracted_orf_exons))
    logger.debug(msg)

    # the annotated transcript exons are only read (or created) when filtering;
    # default to None so the later subtract_bed call falls back to its own
    # exon-splitting behavior if --filter was not given
    annotated_exons = None

    # check if we want to remove the extracted_orfs completely covered by
    # the annotated transcripts
    if args.filter:
        msg = ("Removing extracted ORFs which are completely covered by the "
               "annotated transcripts")
        logger.info(msg)

        # we need the annotated transcript exons
        if args.annotated_exons is None:
            msg = "Splitting the annotated transcripts into exon blocks"
            logger.info(msg)

            annotated_exons = bed_utils.split_bed12(annotated_transcripts,
                                                    num_cpus=args.num_cpus,
                                                    progress_bar=True)
        else:
            msg = "Reading the annotated transcript exons"
            logger.info(msg)

            annotated_exons = bed_utils.read_bed(args.annotated_exons)

        msg = "Finding completely covered extracted ORFs"
        logger.info(msg)

        nonoverlapping_ids = bed_utils.subtract_bed(extracted_orf_exons,
                                                    annotated_exons,
                                                    min_a_overlap=1)

        m_unfiltered = extracted_orfs['id'].isin(nonoverlapping_ids)
        extracted_orfs = extracted_orfs[m_unfiltered]

        # also discard the unnecessary exons
        m_unfiltered = extracted_orf_exons['id'].isin(nonoverlapping_ids)
        extracted_orf_exons = extracted_orf_exons[m_unfiltered]

        msg = "After filtering, {} extracted ORFs remain".format(
            len(extracted_orfs))
        logger.info(msg)

    # if the nonoverlapping-label is given, annotate and remove the ORFs
    # which do not at all overlap the annotations
    if args.nonoverlapping_label is not None:

        nonoverlapping_ids = bed_utils.subtract_bed(
            extracted_orfs,
            annotated_transcripts,
            exons_a=extracted_orf_exons,
            exons_b=annotated_exons)

        m_nonoverlapping = extracted_orf_exons['id'].isin(nonoverlapping_ids)
        extracted_orf_exons = extracted_orf_exons[~m_nonoverlapping]

        m_nonoverlapping = extracted_orfs['id'].isin(nonoverlapping_ids)
        extracted_orfs.loc[m_nonoverlapping,
                           'orf_type'] = args.nonoverlapping_label

        msg = ("Found {} ORFs completely nonoverlapping annotated transcripts".
               format(len(nonoverlapping_ids)))
        logger.info(msg)

    msg = "Removing the annotated UTRs from the transcripts"
    logger.info(msg)
    canonical_orfs = bed_utils.retain_all_thick_only(annotated_transcripts,
                                                     num_cpus=args.num_cpus)

    msg = "Splitting the canonical ORFs into exons"
    logger.info(msg)
    canonical_orf_exons = bed_utils.split_bed12(canonical_orfs,
                                                num_cpus=args.num_cpus,
                                                progress_bar=True)

    msg = "Extracting annotated 5' leader regions"
    logger.info(msg)
    five_prime_regions = bed_utils.retain_all_five_prime_of_thick(
        annotated_transcripts, num_cpus=args.num_cpus)

    if len(five_prime_regions) == 0:
        msg = "No annotated 5' leader regions were found"
        logger.warning(msg)

    msg = "Splitting the 5' leaders into exons"
    logger.info(msg)
    five_prime_exons = bed_utils.split_bed12(five_prime_regions,
                                             num_cpus=args.num_cpus,
                                             progress_bar=True)

    msg = "Extracting annotated 3' trailer regions"
    logger.info(msg)
    three_prime_regions = bed_utils.retain_all_three_prime_of_thick(
        annotated_transcripts, num_cpus=args.num_cpus)

    if len(three_prime_regions) == 0:
        msg = "No annotated 3' trailer regions were found"
        logger.warning(msg)

    msg = "Splitting the 3' trailers into exons"
    logger.info(msg)
    three_prime_exons = bed_utils.split_bed12(three_prime_regions,
                                              num_cpus=args.num_cpus,
                                              progress_bar=True)

    msg = "Splitting noncoding transcripts into exons"
    logger.info(msg)

    m_no_thick_start = annotated_transcripts['thick_start'] == -1
    m_no_thick_end = annotated_transcripts['thick_end'] == -1
    m_no_thick = m_no_thick_start & m_no_thick_end
    noncoding_transcripts = annotated_transcripts[m_no_thick]

    noncoding_exons = bed_utils.split_bed12(noncoding_transcripts,
                                            num_cpus=args.num_cpus,
                                            progress_bar=True)

    msg = "Marking canonical and extracted ORFs with the same stop codon"
    logger.info(msg)

    # first, add the true ORF end
    m_forward_canonical = canonical_orfs['strand'] == '+'
    m_reverse_canonical = canonical_orfs['strand'] == '-'

    m_forward_extracted = extracted_orfs['strand'] == '+'
    m_reverse_extracted = extracted_orfs['strand'] == '-'

    canonical_orfs['orf_end'] = canonical_orfs['end']
    canonical_orfs.loc[m_reverse_canonical,
                       'orf_end'] = canonical_orfs.loc[m_reverse_canonical,
                                                       'start']

    extracted_orfs['orf_end'] = extracted_orfs['end']
    extracted_orfs.loc[m_reverse_extracted,
                       'orf_end'] = extracted_orfs.loc[m_reverse_extracted,
                                                       'start']

    # now, find extracted ORFs with the same "orf_end" (and seqname, strand) as canonical ORFs
    merge_fields = ['seqname', 'strand', 'orf_end']
    canonical_extracted_orf_ends = canonical_orfs.merge(
        extracted_orfs, on=merge_fields, suffixes=['_canonical', '_extracted'])

    # now, pull this into a set
    zip_it = zip(canonical_extracted_orf_ends['id_canonical'],
                 canonical_extracted_orf_ends['id_extracted'])
    canonical_extracted_matching_ends = {(c, a) for c, a in zip_it}

    msg = "Finding ORFs which exactly overlap the canonical ORFs"
    logger.info(msg)

    exact_matches = bed_utils.get_bed_overlaps(canonical_orf_exons,
                                               extracted_orf_exons,
                                               min_a_overlap=1,
                                               min_b_overlap=1)

    exact_match_orf_ids = {o.b_info for o in exact_matches}
    m_exact_orf_matches = extracted_orf_exons['id'].isin(exact_match_orf_ids)
    extracted_orf_exons = extracted_orf_exons[~m_exact_orf_matches]

    m_canonical = extracted_orfs['id'].isin(exact_match_orf_ids)
    extracted_orfs.loc[m_canonical, 'orf_type'] = 'canonical'

    msg = "Found {} canonical ORFs".format(len(exact_match_orf_ids))
    logger.info(msg)

    msg = "Finding ORFs which are extended versions of the canonical ORFs"
    logger.info(msg)

    extended_matches = bed_utils.get_bed_overlaps(canonical_orf_exons,
                                                  extracted_orf_exons,
                                                  min_a_overlap=1)

    # make sure the "end"s match before calling something an extended match
    extended_match_ids = {
        m.b_info
        for m in tqdm.tqdm(extended_matches)
        if (m.a_info, m.b_info) in canonical_extracted_matching_ends
    }

    m_extended_matches = extracted_orf_exons['id'].isin(extended_match_ids)
    extracted_orf_exons = extracted_orf_exons[~m_extended_matches]

    m_canonical_extended = extracted_orfs['id'].isin(extended_match_ids)

    l = "{}canonical_extended".format(args.label_prefix)
    extracted_orfs.loc[m_canonical_extended, 'orf_type'] = l

    msg = "Found {} canonical_extended ORFs".format(len(extended_match_ids))
    logger.info(msg)

    msg = "Finding ORFs which are truncated versions of the canonical ORFs"
    logger.info(msg)

    truncated_matches = bed_utils.get_bed_overlaps(canonical_orf_exons,
                                                   extracted_orf_exons,
                                                   min_b_overlap=1)

    # make sure the "end"s match before calling something a truncated match
    truncated_match_ids = {
        m.b_info
        for m in tqdm.tqdm(truncated_matches)
        if (m.a_info, m.b_info) in canonical_extracted_matching_ends
    }

    m_truncated_matches = extracted_orf_exons['id'].isin(truncated_match_ids)
    extracted_orf_exons = extracted_orf_exons[~m_truncated_matches]

    m_canonical_truncated = extracted_orfs['id'].isin(truncated_match_ids)

    l = "{}canonical_truncated".format(args.label_prefix)
    extracted_orfs.loc[m_canonical_truncated, 'orf_type'] = l

    msg = "Found {} canonical_truncated ORFs".format(len(truncated_match_ids))
    logger.info(msg)

    msg = ("Labeling ORFs which are completely covered by a canonical ORF but "
           "do not share its stop codon")
    logger.info(msg)

    # anything in "truncated matches" which *does not* share a stop codon with
    # the match is a "within" orf
    within_ids = {
        m.b_info
        for m in truncated_matches if m.b_info not in truncated_match_ids
    }

    m_within_matches = extracted_orf_exons['id'].isin(within_ids)
    extracted_orf_exons = extracted_orf_exons[~m_within_matches]

    m_within = extracted_orfs['id'].isin(within_ids)

    l = "{}within".format(args.label_prefix)
    extracted_orfs.loc[m_within, 'orf_type'] = l

    msg = "Found {} within ORFs".format(len(within_ids))
    logger.info(msg)

    msg = "Finding out-of-frame overlaps"
    logger.info(msg)
    out_of_frame_matches = bed_utils.get_bed_overlaps(canonical_orf_exons,
                                                      extracted_orf_exons)

    msg = "Finding leader overlaps"
    logger.info(msg)

    leader_matches = bed_utils.get_bed_overlaps(five_prime_exons,
                                                extracted_orf_exons)

    msg = "Finding trailer overlaps"
    logger.info(msg)

    trailer_matches = bed_utils.get_bed_overlaps(three_prime_exons,
                                                 extracted_orf_exons)

    msg = ("Labeling ORFs which have (out-of-frame) overlaps with both a "
           "canonical ORF and annotated leaders or trailers")
    logger.info(msg)

    out_of_frame_ids = {m.b_info for m in out_of_frame_matches}
    leader_ids = {m.b_info for m in leader_matches}
    trailer_ids = {m.b_info for m in trailer_matches}

    leader_overlap_ids = out_of_frame_ids & leader_ids
    trailer_overlap_ids = out_of_frame_ids & trailer_ids

    m_leader_overlap_matches = extracted_orf_exons['id'].isin(
        leader_overlap_ids)
    extracted_orf_exons = extracted_orf_exons[~m_leader_overlap_matches]

    m_trailer_overlap_matches = extracted_orf_exons['id'].isin(
        trailer_overlap_ids)
    extracted_orf_exons = extracted_orf_exons[~m_trailer_overlap_matches]

    m_five_prime_overlap = extracted_orfs['id'].isin(leader_overlap_ids)

    l = "{}five_prime_overlap".format(args.label_prefix)
    extracted_orfs.loc[m_five_prime_overlap, 'orf_type'] = l

    m_three_prime_overlap = extracted_orfs['id'].isin(trailer_overlap_ids)

    l = "{}three_prime_overlap".format(args.label_prefix)
    extracted_orfs.loc[m_three_prime_overlap, 'orf_type'] = l

    msg = "Found {} five_prime_overlap ORFs".format(len(leader_overlap_ids))
    logger.info(msg)

    msg = "Found {} three_prime_overlap ORFs".format(len(trailer_overlap_ids))
    logger.info(msg)

    msg = "Finding ORFs completely within 5' leaders"
    logger.info(msg)

    leader_matches = bed_utils.get_bed_overlaps(five_prime_exons,
                                                extracted_orf_exons,
                                                min_b_overlap=1)
    leader_ids = {m.b_info for m in leader_matches}

    m_leader_matches = extracted_orf_exons['id'].isin(leader_ids)
    extracted_orf_exons = extracted_orf_exons[~m_leader_matches]

    m_five_prime = extracted_orfs['id'].isin(leader_ids)

    l = "{}five_prime".format(args.label_prefix)
    extracted_orfs.loc[m_five_prime, 'orf_type'] = l

    msg = "Found {} five_prime ORFs".format(len(leader_ids))
    logger.info(msg)

    msg = "Finding ORFs completely within 3' trailers"
    logger.info(msg)

    trailer_matches = bed_utils.get_bed_overlaps(three_prime_exons,
                                                 extracted_orf_exons,
                                                 min_b_overlap=1)
    trailer_ids = {m.b_info for m in trailer_matches}

    m_trailer_matches = extracted_orf_exons['id'].isin(trailer_ids)
    extracted_orf_exons = extracted_orf_exons[~m_trailer_matches]

    m_three_prime = extracted_orfs['id'].isin(trailer_ids)

    l = "{}three_prime".format(args.label_prefix)
    extracted_orfs.loc[m_three_prime, 'orf_type'] = l

    msg = "Found {} three_prime ORFs".format(len(trailer_ids))
    logger.info(msg)

    msg = "Finding ORFs completely within annotated, noncoding transcripts"
    logger.info(msg)

    noncoding_matches = bed_utils.get_bed_overlaps(noncoding_exons,
                                                   extracted_orf_exons,
                                                   min_b_overlap=1)

    noncoding_ids = {m.b_info for m in noncoding_matches}

    m_noncoding_matches = extracted_orf_exons['id'].isin(noncoding_ids)
    extracted_orf_exons = extracted_orf_exons[~m_noncoding_matches]

    m_noncoding = extracted_orfs['id'].isin(noncoding_ids)

    l = "{}noncoding".format(args.label_prefix)
    extracted_orfs.loc[m_noncoding, 'orf_type'] = l

    msg = "Found {} noncoding ORFs".format(len(noncoding_ids))
    logger.info(msg)

    # all of the remaining ORFs fall into the "suspect" category
    suspect_ids = set(extracted_orf_exons['id'])

    m_suspect = extracted_orfs['id'].isin(suspect_ids)

    l = "{}suspect".format(args.label_prefix)
    extracted_orfs.loc[m_suspect, 'orf_type'] = l

    msg = "Found {} \"suspect\" ORFs".format(len(suspect_ids))
    logger.info(msg)

    m_no_orf_type = extracted_orfs['orf_type'].isnull()

    msg = "Found {} unlabeled ORFs".format(sum(m_no_orf_type))
    logger.info(msg)

    msg = "Writing ORFs with types to disk"
    logger.info(msg)

    fields = bed_utils.bed12_field_names + ['orf_num', 'orf_len', 'orf_type']
    extracted_orfs = extracted_orfs[fields]
    extracted_orfs = bed_utils.sort(extracted_orfs)

    bed_utils.write_bed(extracted_orfs, args.out)
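
# A minimal, self-contained sketch of the containment test the labeling above
# relies on (via `min_b_overlap=1`): an extracted ORF counts as "completely
# covered" when the overlap accounts for all of its bases. The helper below is
# illustrative interval arithmetic only, not the bed_utils implementation, and
# it ignores exon structure and strand.
def _overlap_fraction(a_start, a_end, b_start, b_end):
    """Return the fraction of [b_start, b_end) covered by [a_start, a_end)."""
    overlap = max(0, min(a_end, b_end) - max(a_start, b_start))
    return overlap / (b_end - b_start)

if __name__ == "__main__":
    canonical = (100, 400)            # hypothetical canonical ORF exon
    contained_candidate = (200, 400)  # fully inside: fraction == 1.0
    partial_candidate = (300, 500)    # extends past the end: fraction == 0.5
    print(_overlap_fraction(*canonical, *contained_candidate))
    print(_overlap_fraction(*canonical, *partial_candidate))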
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script extracts all of the ORFs from the given transcripts. "
        "It writes the result as a bed12+1 file. The additional field, 'orf_len', gives "
        "the length of the respective ORF. It removes duplicate ORFs.\n\nN.B. The DEBUG "
        "output for this script is _very_ verbose. It is not recommended to run this "
        "script with that logging level.")

    parser.add_argument('transcripts_bed',
                        help="The bed12 file containing the "
                        "transcript information")

    parser.add_argument('transcripts_fasta',
                        help="The fasta file containing the "
                        "spliced transcript sequences")

    parser.add_argument('out', help="The output (bed12+1 gz) file")

    parser.add_argument('--start-codons',
                        help="A list of codons which will be "
                        "treated as start codons when extracting ORFs",
                        nargs='+',
                        default=default_start_codons)

    parser.add_argument('--stop-codons',
                        help="A list of codons which will be "
                        "treated as stop codons when extracting ORFs",
                        nargs='+',
                        default=default_stop_codons)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # check if we wanted to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    msg = "Compiling start and stop codon regular expressions"
    logger.info(msg)

    start_codons_re = '|'.join(args.start_codons)
    stop_codons_re = '|'.join(args.stop_codons)

    start_codons_re = re.compile(start_codons_re)
    stop_codons_re = re.compile(stop_codons_re)

    msg = "Reading transcripts bed file"
    logger.info(msg)
    transcripts_bed = bed_utils.read_bed(args.transcripts_bed)

    msg = "Creating the sequence iterator"
    logger.info(msg)

    transcripts_fasta = fastx_utils.get_read_iterator(args.transcripts_fasta)

    transcripts_iter = ((get_transcript(transcript_header,
                                        transcripts_bed), transcript_sequence)
                        for (transcript_header,
                             transcript_sequence) in transcripts_fasta)

    msg = "Finding all ORFs"
    logger.info(msg)

    orfs = parallel.apply_parallel_iter(transcripts_iter,
                                        args.num_cpus,
                                        get_orfs,
                                        start_codons_re,
                                        stop_codons_re,
                                        total=len(transcripts_bed),
                                        progress_bar=True)

    msg = "Joining ORFs in a large data frame"
    logger.info(msg)

    orfs = pd.concat(orfs)

    msg = "Removing duplicate ORFs"
    logger.info(msg)

    orfs = orfs.drop_duplicates(subset=DUPLICATE_FIELDS)

    msg = "Numbering remaining ORFs"
    logger.info(msg)

    orfs['orf_num'] = np.arange(len(orfs))

    msg = "Writing ORFs to disk"
    logger.info(msg)
    bed_utils.write_bed(orfs, args.out)
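
# A minimal, self-contained sketch of how the compiled start and stop codon
# regular expressions above can locate ORF coordinates within a single
# transcript sequence. This is a simplified, hypothetical stand-in for the
# real `get_orfs` helper, which also maps the coordinates back through the
# bed12 exon structure.
import re

def _find_orfs(sequence,
               start_re=re.compile("ATG"),
               stop_re=re.compile("TAA|TGA|TAG")):
    """Yield (start, end) pairs for in-frame start..stop stretches."""
    for start_match in start_re.finditer(sequence):
        start = start_match.start()
        # scan downstream, codon by codon, for the first in-frame stop
        for pos in range(start + 3, len(sequence) - 2, 3):
            if stop_re.match(sequence, pos):
                yield (start, pos + 3)  # end is exclusive and includes the stop
                break

if __name__ == "__main__":
    print(list(_find_orfs("CCATGAAATTTTAGGG")))  # [(2, 14)]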
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates a bar chart which shows the count of "
        "each ORF type in a given BED12+ file. Optionally, the ORFs can be "
        "grouped into similar types.")

    parser.add_argument('orfs', help="The BED12+ file with the ORFs")
    parser.add_argument('out', help="The output (image) file")

    parser.add_argument('--title', help="The title to use for the plot", 
        default=default_title)

    parser.add_argument('--use-groups', help="If this flag is given, the ORFs "
        "will be grouped", action='store_true')

    parser.add_argument('--fontsize', default=default_fontsize)
    parser.add_argument('--legend-fontsize', default=default_legend_fontsize)
    parser.add_argument('--ymax', type=int, default=default_ymax)
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading bed file"
    logger.info(msg)

    bed = bed_utils.read_bed(args.orfs)

    if args.use_groups:
        bed['orf_type_group'] = bed['orf_type'].map(
            ribo_utils.orf_type_labels_reverse_mapping)

        orf_type_counts = bed.groupby(['orf_type_group', 'strand']).size()
        orf_type_counts = orf_type_counts.reset_index(name="count")
        orf_type_counts['display_name'] = orf_type_counts['orf_type_group'].map(
            ribo_utils.orf_type_labels_display_name_map)
    else:
        orf_type_counts = bed.groupby(['orf_type', 'strand']).size()
        orf_type_counts = orf_type_counts.reset_index(name="count")
        orf_type_counts['display_name'] = orf_type_counts['orf_type'].map(
            ribo_utils.orf_type_display_name_map)

    msg = "Creating the bar chart"

    color = sns.palettes.color_palette("Set3", n_colors=3)

    fig, ax = plt.subplots(figsize=(9,5))
    sns.barplot(
        x="display_name",
        y="count",
        hue="strand",
        data=orf_type_counts,
        ax=ax,
        zorder=-1,
        palette='Set3',
        log=True
    )

    sns.despine()

    ax.legend(
        loc='upper right', 
        bbox_to_anchor=(1.0, 0.95), 
        fontsize=args.legend_fontsize, 
        frameon=True, 
        framealpha=0.9,
        title="Strand"
    )
    mpl_utils.set_legend_title_fontsize(ax, args.fontsize)

    # the y-axis is already log-scaled by the barplot call above
    if args.ymax is not None:
        ax.set_ylim((1, args.ymax))

    ax.set_ylabel("Number of ORFs", fontsize=args.fontsize)
    ax.set_xlabel("", fontsize=0)

    # rotate the ORF type names
    mpl_utils.set_ticklabels_fontsize(ax, args.fontsize)
    mpl_utils.set_ticklabel_rotation(ax, axis='x', rotation=90)

    # place the ORF type names in the middle of the bar
    for ticklabel in ax.xaxis.get_ticklabels():    
        p = ticklabel.get_position()
        ticklabel.set_position((p[0], 0.1))
        ticklabel.set_verticalalignment('bottom')
        
    if args.title is not None:
        ax.set_title(args.title, fontsize=args.fontsize)
        
    if args.out is not None:
        fig.savefig(args.out, bbox_inches='tight')
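
# A minimal sketch of the grouping step behind `--use-groups` above. The real
# mappings live in ribo_utils (orf_type_labels_reverse_mapping and the display
# name maps); the dictionary and values here are illustrative only.
import pandas as pd

_example_group_map = {
    'canonical': 'canonical',
    'canonical_extended': 'canonical_variant',
    'canonical_truncated': 'canonical_variant',
    'five_prime': 'uorf',
    'three_prime': 'dorf',
}

if __name__ == "__main__":
    bed = pd.DataFrame({
        'orf_type': ['canonical', 'canonical_truncated', 'five_prime'],
        'strand': ['+', '-', '+'],
    })
    bed['orf_type_group'] = bed['orf_type'].map(_example_group_map)
    print(bed.groupby(['orf_type_group', 'strand']).size().reset_index(name="count"))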
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script trains a model to predict the runtime for a "
        "solver from an ASlib scenario using autosklearn. It assumes an "
        "\"outer\" cross-validation strategy, and it only trains a model for "
        "the indicated folds and solvers. It then writes the learned model to "
        "disk. It *does not* collect any statistics, make predictions ,etc.")

    parser.add_argument('scenario', help="The ASlib scenario")
    
    parser.add_argument('out', help="A template string for the filenames for "
        "the learned models. They are written with joblib.dump, so they need "
        "to be read back in with joblib.load. ${solver} and ${fold} are the "
        "template part of the string. It is probably necessary to surround "
        "this argument with single quotes in order to prevent shell "
        "replacement of the template parts.")

    parser.add_argument('--config', help="A (yaml) config file which specifies "
        "options controlling the learner behavior")

    parser.add_argument('--solvers', help="The solvers for which models will "
        "be learned. By default, models for all solvers are learned", 
        nargs='*', default=[])

    parser.add_argument('--folds', help="The outer-cv folds for which a model "
        "will be learned. By default, models for all folds are learned", 
        type=int, nargs='*', default=[])

    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use "
        "for parallel solver/fold training", type=int, 
        default=default_num_cpus)
    
    parser.add_argument('--num-blas-threads', help="The number of threads to "
        "use for parallelizing BLAS. The total number of CPUs will be "
        "\"num_cpus * num_blas_cpus\". Currently, this flag only affects "
        "OpenBLAS and MKL.", type=int, default=default_num_blas_cpus)

    parser.add_argument('--do-not-update-env', help="By default, num-blas-threads "
        "requires that relevant environment variables are updated. Likewise, "
        "if num-cpus is greater than one, it is necessary to turn off python "
        "assertions due to an issue with multiprocessing. If this flag is "
        "present, then the script assumes those updates are already handled. "
        "Otherwise, the relevant environment variables are set, and a new "
        "processes is spawned with this flag and otherwise the same "
        "arguments. This flag is not inended for external users.",
        action='store_true')

    automl_utils.add_automl_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # see which folds to run
    folds = args.folds
    if len(folds) == 0:
        folds = range(1, 11)

    for f in folds:
        math_utils.check_range(f, 1, 10, variable_name="fold")

    # and which solvers
    msg = "Reading ASlib scenario"
    logger.info(msg)
    scenario = ASlibScenario()
    scenario.read_scenario(args.scenario)

    # ensure the selected solver is present
    solvers = args.solvers
    if len(solvers) == 0:
        solvers = scenario.algorithms

    for solver in solvers:
        if solver not in scenario.algorithms:
            solver_str = ','.join(scenario.algorithms)
            msg = ("[train-auto-sklear]: the solver is not present in the "
                "ASlib scenario. given: {}. choices: {}".format(solver, 
                solver_str))
            raise ValueError(msg)

    if args.config is not None:
        msg = "Reading config file"
        logger.info(msg)
        config = yaml.load(open(args.config))
    else:
        config = {}

    # everything is present, so update the environment variables and spawn a
    # new process, if necessary
    if not args.do_not_update_env:
        ###
        #
        # There is a lot going on with setting these environment variables.
        # Please see the following references:
        #
        #   Turning off assertions so we can parallelize sklearn across
        #   multiple CPUs for different solvers/folds
        #       https://github.com/celery/celery/issues/1709
        #
        #   Controlling OpenBLAS threads
        #       https://github.com/automl/auto-sklearn/issues/166
        #
        #   Other environment variables controlling thread usage
        #       http://stackoverflow.com/questions/30791550
        #
        ###
        
        # we only need to turn off the assertions if we parallelize across cpus
        if args.num_cpus > 1:
            os.environ['PYTHONOPTIMIZE'] = "1"

        # openblas
        os.environ['OPENBLAS_NUM_THREADS'] = str(args.num_blas_threads)
        
        # mkl blas
        os.environ['MKL_NUM_THREADS'] = str(args.num_blas_threads)

        # other stuff from the SO post
        os.environ['OMP_NUM_THREADS'] = str(args.num_blas_threads)
        os.environ['NUMEXPR_NUM_THREADS'] = str(args.num_blas_threads)

        cmd = ' '.join(shlex.quote(a) for a in sys.argv)
        cmd += " --do-not-update-env"
        shell_utils.check_call(cmd)
        return

    msg = "Learning regressors"
    logger.info(msg)

    it = itertools.product(solvers, folds)
    regressors = parallel.apply_parallel_iter(
        it,
        args.num_cpus,
        _outer_cv,
        args,
        config,
        progress_bar=True
    )
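
# A minimal, self-contained sketch of the "update the environment and respawn"
# pattern used above: thread-count variables such as OPENBLAS_NUM_THREADS are
# only read when the BLAS library is loaded, so they must be set before the
# worker process starts. This uses plain subprocess rather than
# shell_utils.check_call and is illustrative only.
import os
import subprocess
import sys

def _respawn_with_blas_threads(num_blas_threads):
    env = dict(os.environ)
    for var in ('OPENBLAS_NUM_THREADS', 'MKL_NUM_THREADS',
                'OMP_NUM_THREADS', 'NUMEXPR_NUM_THREADS'):
        env[var] = str(num_blas_threads)

    # re-run the same command, telling the child not to repeat this step
    cmd = [sys.executable] + sys.argv + ['--do-not-update-env']
    subprocess.check_call(cmd, env=env)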
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Create a bar chart for either a single or all samples "
        "from the output of get-read-length-distribution.")
    parser.add_argument('distribution', help="The (csv) length distribution "
        "file")
    parser.add_argument('basename', help="The basename of the read counts to "
        "visualize, or ALL")
    parser.add_argument('out', help="The output (image) file")
    
    parser.add_argument('--title', help="The title of the plot", 
        default=default_title)

    parser.add_argument('--min-read-length', type=int, 
        default=default_min_read_length)
    parser.add_argument('--max-read-length', type=int, 
        default=default_max_read_length)
    parser.add_argument('--ymax', help="The maximum for the y-axis", type=int,
        default=default_ymax)
    
    parser.add_argument('--fontsize', help="The font size to use for most of "
        "the text in the plot", type=int, default=default_fontsize)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading the length distributions"
    logger.info(msg)
    distribution_df = pd.read_csv(args.distribution)

    msg = "Filtering lengths and samples based on parameters"
    logger.info(msg)

    m_sample = np.full(len(distribution_df), True)
    if args.basename != "ALL":
        m_sample = distribution_df['basename'] == args.basename
        
    m_min_read_length = np.full(len(distribution_df), True)
    if args.min_read_length is not None:
        m_min_read_length = distribution_df['length'] >= args.min_read_length

    m_max_read_length = np.full(len(distribution_df), True)
    if args.max_read_length is not None:
        m_max_read_length = distribution_df['length'] <= args.max_read_length

    m_to_view = m_min_read_length & m_max_read_length & m_sample
    distribution_df = distribution_df[m_to_view]

    # guess ymax if it is not given
    if args.ymax is None:
        msg = "Guessing ymax"
        logger.info(msg)
        args.ymax = distribution_df['count'].max()

    msg = "Creating the plots"
    logger.info(msg)

    g = sns.factorplot(
        x='length',
        y='count',
        col='basename',
        data=distribution_df,
        kind="bar",
        col_wrap=3,
        size=5,
        aspect=1.5
    )

    g.set_titles("{col_name}")
    g.set_xlabels('Length', fontsize=args.fontsize)
    g.set_ylabels('Count', fontsize=args.fontsize)

    for ax in g.axes.flat:
        # hack for punch-p data
        text = ax.title.get_text()
        
        text = text.replace(".riboseq.cell-type-hela.rep-", " (Kim, ")
        text = text.replace(".riboseq.cell-type-hela-s3.rep-", " (")
        text = text.replace(".GRCh38_85.fastq", ")")
        ax.title.set_text(text)
        
        ax.set_yscale('log')
        ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.0e'))

        ax.set_ylim((1, args.ymax))
        mpl_utils.remove_top_and_right_splines(ax)

        mpl_utils.set_title_fontsize(ax, args.fontsize)
        mpl_utils.set_ticklabels_fontsize(ax, args.fontsize)
        mpl_utils.set_ticklabel_rotation(ax, 90)
        mpl_utils.hide_first_y_tick_label(ax)
        
        
    if args.title is not None:
        g.fig.suptitle(args.title, size=args.fontsize, y=1.01)

    msg = "Writing the plot to disk"
    logger.info(msg)
    g.savefig(args.out)
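
# A minimal sketch of the length-distribution input this plotting script
# expects, inferred from the columns used above ('basename', 'length',
# 'count'), together with the same boolean-mask filtering. The sample values
# are made up for illustration.
import pandas as pd

if __name__ == "__main__":
    distribution_df = pd.DataFrame({
        'basename': ['sample-1'] * 3 + ['sample-2'] * 3,
        'length': [28, 29, 30] * 2,
        'count': [1500, 4200, 3100, 900, 3800, 2700],
    })

    m_sample = distribution_df['basename'] == 'sample-1'
    m_length = ((distribution_df['length'] >= 29)
                & (distribution_df['length'] <= 30))
    print(distribution_df[m_sample & m_length])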
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Visualize the read counts at each filtering step.")

    parser.add_argument('alignment_counts', help="The (csv) alignment counts "
        "(created with get-all-filtering-counts)")
    parser.add_argument('out', help="The output image file")

    parser.add_argument('--alignment-counts-order', help="The fields to use "
        "from the alignment_counts file. The order should range from the "
        "least strict filter to most strict filter.", nargs='+', 
        default=default_alignment_counts_order)

    parser.add_argument('--alignment-counts-names', help="The text to use for "
        "the kept fields in the plot. The order of the names must match that "
        "of --alignment-counts-order.", nargs='+', 
        default=default_alignment_counts_names)

    parser.add_argument('--without-rrna', help="If this flag is present, then "
        "a default set of fields excluding the reads mapping to ribosomal "
        "sequences will be used.", action='store_true')

    parser.add_argument('--title', help="The title of the plot", default=None)

    parser.add_argument('--fontsize', help="The font size to use for most of "
        "the text in the plot", type=int, default=default_fontsize)

    parser.add_argument('--legend-fontsize', help="The font size to use for "
        "the legend labels", type=int, default=default_legend_fontsize)

    parser.add_argument('--ymax', help="The maximum for the y-axis", type=int,
        default=default_ymax)
    parser.add_argument('--ystep', help="The step size for ticks on the "
        "y-axis.", type=int, default=default_ystep)
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    
    if args.without_rrna:
        msg = "Using the default without rrna field order"
        logger.info(msg)

        args.alignment_counts_order = without_rrna_order
        args.alignment_counts_names = without_rrna_names

    # we really need the fields from most- to least-restrictive
    args.alignment_counts_order = args.alignment_counts_order[::-1]
    args.alignment_counts_names = args.alignment_counts_names[::-1]

    msg = "Reading counts"
    logger.info(msg)

    alignment_counts = pd.read_csv(args.alignment_counts)
    alignment_counts = alignment_counts.sort_values('note')

    msg = "Calculating the diff counts"
    logger.info(msg)
    alignment_diff_counts = mpl_utils.get_diff_counts(
        alignment_counts[args.alignment_counts_order])

    df = pd.DataFrame(alignment_diff_counts)
    df.columns = args.alignment_counts_names
    
    names = alignment_counts['note'].reset_index(drop=True)
    df['name'] = names

    msg = "Creating the stacked bar chart"
    logger.info(msg)

    fig, ax = plt.subplots()

    pal = sns.palettes.color_palette(
        palette="Set3",
        n_colors=len(args.alignment_counts_names)
    )

    gap = 0.15

    # if we aren't given information about the y-axis, try to guess
    if args.ymax is None:
        field = args.alignment_counts_order[-2]
        max_count = alignment_counts[field].max()

        if args.ystep > max_count:
            args.ystep = np.ceil(max_count / 4)

        args.ymax = (np.ceil(max_count / args.ystep) * args.ystep) + 1
    
    yticks = np.arange(0, args.ymax, args.ystep)

    bars = mpl_utils.create_stacked_bar_graph(
        ax,
        alignment_diff_counts,
        colors=pal,
        x_tick_labels=df['name'],
        y_ticks=yticks,
        y_tick_labels=yticks,
        gap=gap,
        end_gaps=True,
        stack_labels=args.alignment_counts_names,
        y_title='Reads',
        log=False,
        font_size=args.fontsize,
        edge_colors='0.5'
    )

    ax.legend(
        loc='upper center',
        bbox_to_anchor=(0.5, -0.6),
        ncol=3,
        fontsize=args.legend_fontsize,
        title="Filter",
        frameon=True
    )

    ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.0e'))
    mpl_utils.set_label_fontsize(ax, args.fontsize)
    mpl_utils.set_legend_title_fontsize(ax, args.fontsize)

    if args.title is not None:
        ax.set_title(args.title, fontsize=args.fontsize)

    msg = "Writing the plot to disk"
    logger.info(msg)
    fig.savefig(args.out, bbox_inches='tight')
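
# A minimal sketch of the "diff counts" transformation used above: the read
# counts remaining after each filter (ordered most- to least-restrictive) are
# converted into per-step differences, so stacking the segments reconstructs
# the original totals. This is an illustrative stand-in for
# mpl_utils.get_diff_counts, not its implementation.
import numpy as np

def _diff_counts(cumulative_counts):
    counts = np.asarray(cumulative_counts)
    diffs = np.diff(counts, axis=1)
    return np.concatenate([counts[:, :1], diffs], axis=1)

if __name__ == "__main__":
    # one sample; columns ordered most-restrictive -> least-restrictive
    remaining = [[1000, 4000, 9000]]
    print(_diff_counts(remaining))  # [[1000 3000 5000]]; the stack sums to 9000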