Beispiel #1
0
def main():
    """Command-line entry point for merging GTF features across isoforms.

    After parsing the command line, the script either resubmits itself via
    slurm (when requested) or runs the pipeline directly: read the GTF file,
    keep the rows of the requested feature type, parse their attributes in
    parallel, merge the rows of each group (as defined by the group
    attribute), optionally duplicate the merged rows as "exon" features, sort
    them, drop duplicate rows and write the result as an uncompressed GTF
    file.
    """
    cli = argparse.ArgumentParser(
        description=
        "This script merges either the exons or CDS regions of all transcript "
        "isoforms into a single \"super gene isoform\". It does this based on the given "
        "GTF feature type and attribute (with defaults \"CDS\" and \"gene_id\", respectively).",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # positional arguments
    cli.add_argument('gtf', help="The GTF file")
    cli.add_argument('out', help="The output (merged) GTF file")

    # what to merge, and how to name the results
    cli.add_argument('--feature-type',
                     default=default_feature_type,
                     help="The type of features to merge")
    cli.add_argument('--group-attribute',
                     default=default_group_attribute,
                     help="The attribute by which the features "
                     "will be merged")
    cli.add_argument('--id-format-str',
                     default=default_id_format_str,
                     help="The python format string to "
                     "use for creating the \"transcript\" identifiers")

    # sorting and output options
    cli.add_argument(
        '--chr-name-file',
        default=default_chr_name_file,
        help="If this file is specified, it will "
        "be used to determine the seqname sort order. This should be the "
        "\"chrName.txt\" file created by STAR. If not present, the transcripts "
        "will be sorted alphabetically (1, 10, 11, 2, ..., KL568162.1, MT, X, Y).")
    cli.add_argument(
        '--add-exons',
        action='store_true',
        help="If this flag is given, then all features will "
        "be duplicated, but with the feature type \"exon\". Presumably, this should be given "
        "when \"CDS\" features are merged, and the resulting GTF file will be used by STAR "
        "(or anything else expecting \"exon\"s).")

    # parallelization options
    cli.add_argument(
        '-g',
        '--num-groups',
        type=int,
        default=default_num_groups,
        help="The number of groups into which to split "
        "the features. More groups means the progress bar is updated more frequently but incurs "
        "more overhead because of the parallel calls.")

    slurm.add_sbatch_options(cli)
    logging_utils.add_logging_options(cli)
    args = cli.parse_args()
    logging_utils.update_logging(args)

    # when running under slurm, resubmit the exact command line and stop here
    if args.use_slurm:
        slurm.check_sbatch(' '.join(sys.argv), args=args)
        return

    logger.info("Reading GTF file")
    all_features = gtf_utils.read_gtf(args.gtf)

    logger.info("Extracting desired features")
    selected_features = all_features[all_features['feature'] ==
                                     args.feature_type]

    logger.info("Parsing GTF attributes")
    parsed_chunks = parallel.apply_parallel_split(
        selected_features,
        args.num_cpus,
        parse_attributes_group,
        progress_bar=True,
        num_groups=args.num_groups)

    parsed_features = pd.concat(parsed_chunks)
    for coordinate in ('end', 'start'):
        parsed_features[coordinate] = parsed_features[coordinate].astype(int)

    logger.info("Merging isoforms")
    merged_chunks = parallel.apply_parallel_groups(
        parsed_features.groupby(args.group_attribute),
        args.num_cpus,
        merge_gene_group,
        args.group_attribute,
        args.id_format_str,
        progress_bar=True)

    merged = pd.concat(merged_chunks)

    if args.add_exons:
        # duplicate every merged feature, relabelled as an "exon"
        exon_copies = merged.copy()
        exon_copies['feature'] = 'exon'
        merged = pd.concat([exon_copies, merged])

    merged['start'] = merged['start'].astype(int)

    # Sorting borrows the routine written for bed data frames, which needs an
    # 'id' column; the attributes serve as a dummy identifier.
    merged['id'] = merged['attributes']
    merged = bed_utils.sort(merged, seqname_order=args.chr_name_file)

    # finally, remove rows which describe the same feature
    duplicate_key = ['seqname', 'source', 'feature', 'start', 'end', 'strand']
    merged = merged.drop_duplicates(subset=duplicate_key)

    gtf_utils.write_gtf(merged, args.out, compress=False)
def main():
    """Submit the jobs which extract and collect per-read-length ORF profiles.

    For each sample (either the single dataset "name" or, with
    --is-condition, every replicate listed for that condition), one
    "extract-orf-profiles" command is submitted through slurm.check_sbatch
    for every periodic (read length, offset) pair. Afterwards a single
    "collect-read-length-orf-profiles" command is submitted which depends on
    all of the extraction jobs.

    The function returns early (after logging a critical message) if a sample
    has no periodic read lengths.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Extract the ORF profiles for each specified read length "
        "and offset independently. One sparse matrix file will be created for "
        "each read length. It then collects the values into a sparse tensor.")

    parser.add_argument('config', help="The (json) config file")
    parser.add_argument('name',
                        help="The name for the dataset, used in the "
                        "created files")

    parser.add_argument('out',
                        help="The (mtx.gz) output file containing the "
                        "ORF profiles and read lengths")

    parser.add_argument(
        '-c',
        '--is-condition',
        help="If this flag is present, "
        "then \"name\" will be taken to be a condition name. The profiles for "
        "all relevant replicates of the condition will be created.",
        action='store_true')

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # option strings which are forwarded to the submitted commands
    logging_str = logging_utils.get_logging_options_string(args)
    cpus_str = "--num-cpus {}".format(args.num_cpus)

    msg = "[create-read-length-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    msg = "Reading config file"
    logger.info(msg)

    # Close the file deterministically, and use the safe loader: calling
    # yaml.load without an explicit Loader is rejected by recent PyYAML
    # releases and can construct arbitrary objects on older ones.
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)

    # pull out what we need from the config file
    is_unique = not ('keep_riboseq_multimappers' in config)
    seqname_str = utils.get_config_argument(config, 'seqname_prefix')
    note = config.get('note', None)
    orf_note = config.get('orf_note', None)

    orfs = filenames.get_orfs(config['genome_base_path'],
                              config['genome_name'],
                              note=orf_note)

    exons = filenames.get_exons(config['genome_base_path'],
                                config['genome_name'],
                                note=orf_note)

    # make sure the necessary files exist
    required_files = [orfs, exons]
    msg = "[create-read-length-orf-profiles]: Some input files were missing: "
    utils.check_files_exist(required_files, msg=msg)

    # check which samples to process
    names = [args.name]
    if args.is_condition:
        riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
        names = list(riboseq_replicates[args.name])

    job_ids = []
    for name in names:

        msg = "Processing sample: {}".format(name)
        logger.info(msg)

        # now the relevant files
        bam = filenames.get_riboseq_bam(config['riboseq_data'],
                                        name,
                                        is_unique=is_unique,
                                        note=note)

        # make sure the necessary files exist
        required_files = [bam]
        msg = "[create-read-length-orf-profiles]: Some input files were missing: "
        utils.check_files_exist(required_files, msg=msg)

        # get the lengths and offsets which meet the required criteria from the config file
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config, name, is_unique=is_unique)

        if len(lengths) == 0:
            msg = (
                "No periodic read lengths and offsets were found. Try relaxing "
                "min_metagene_profile_count, min_metagene_bf_mean, "
                "max_metagene_bf_var, and/or min_metagene_bf_likelihood. Qutting."
            )
            logger.critical(msg)
            return

        # one extraction job per (read length, offset) pair
        for length, offset in zip(lengths, offsets):
            lengths_str = "--lengths {}".format(length)
            offsets_str = "--offsets {}".format(offset)

            mtx = filenames.get_riboseq_profiles(config['riboseq_data'],
                                                 name,
                                                 length=[length],
                                                 offset=[offset],
                                                 is_unique=is_unique,
                                                 note=note)

            # Nine placeholders for nine arguments: with only eight, the
            # trailing logging options were silently discarded by str.format.
            cmd = "extract-orf-profiles {} {} {} {} {} {} {} {} {}".format(
                bam, orfs, exons, mtx, lengths_str, offsets_str, seqname_str,
                cpus_str, logging_str)

            job_id = slurm.check_sbatch(cmd, args=args)

            job_ids.append(job_id)

    # now, collect them into a single file; the collection command is not
    # given the lengths and offsets on its command line
    is_condition_str = ""
    if args.is_condition:
        is_condition_str = "--is-condition"

    cmd = "collect-read-length-orf-profiles {} {} {} {} {}".format(
        args.config, args.name, args.out, is_condition_str, logging_str)

    slurm.check_sbatch(cmd, args=args, dependencies=job_ids)
Beispiel #3
0
def _smoothing_arguments(is_chisq, fraction, reweighting_iterations):
    """Return the (fraction, reweighting_iterations) pair used in filenames.

    Both values are None when is_chisq is true; otherwise the given values
    are returned unchanged.
    """
    if is_chisq:
        return None, None
    return fraction, reweighting_iterations


def _write_image_figures(out,
                         image_files,
                         caption,
                         image_count,
                         images_per_page,
                         height,
                         log_lookup=True,
                         warn_missing=True):
    """Write the existing files among image_files into latex figures.

    A figure is opened before the first image which is written and whenever
    the previous figure was closed. A figure is closed (with its caption and
    followed by a clearpage) each time the running image count reaches a
    multiple of images_per_page. A figure still open after the last image is
    closed with its caption but without a clearpage.

    Args:
        out: the open latex output file
        image_files: an iterable of image paths; missing files are skipped
        caption: the caption for every figure written by this call
        image_count: the number of images already written in this section;
            it is shared across calls so that page breaks fall after every
            images_per_page images
        images_per_page: the number of images after which a page is cleared
        height: the height passed to latex.write_graphics
        log_lookup: whether to log (debug) each path before checking it
        warn_missing: log missing images as warnings if true, otherwise at
            debug level

    Returns:
        The updated running image count.
    """
    # Track the open figure explicitly so that begin/end calls always pair up,
    # even when some (or all) of the images are missing.
    figure_open = False

    for image_file in image_files:
        if log_lookup:
            msg = "Looking for image file: {}".format(image_file)
            logger.debug(msg)

        if not os.path.exists(image_file):
            msg = "Could not find image: {}".format(image_file)
            if warn_missing:
                logger.warning(msg)
            else:
                logger.debug(msg)
            continue

        if not figure_open:
            latex.begin_figure(out)
            figure_open = True

        image_count += 1
        latex.write_graphics(out, image_file, height=height)

        if image_count % images_per_page == 0:
            latex.write_caption(out, caption)
            latex.end_figure(out)
            latex.clearpage(out)
            figure_open = False

    if figure_open:
        latex.write_caption(out, caption)
        latex.end_figure(out)

    return image_count


def _write_dataset_figures(out,
                           config,
                           names,
                           is_unique,
                           get_image_files,
                           caption_format,
                           images_per_page,
                           height,
                           use_periodic_lengths=True,
                           log_lookup=True,
                           warn_missing=True):
    """Write the figures of one report section for each of the given datasets.

    Args:
        out: the open latex output file
        config: the loaded config dictionary
        names: the sample or replicate names to include
        is_unique: forwarded to ribo_utils.get_periodic_lengths_and_offsets
        get_image_files: a callable (name, lengths, offsets) giving an
            iterable of image paths for one dataset
        caption_format: a format string for the caption, taking the name
        images_per_page: the number of images after which a page is cleared
        height: the height passed to latex.write_graphics
        use_periodic_lengths: if true, the periodic lengths and offsets are
            looked up for each name, and names for which that raises
            FileNotFoundError are skipped; if false, None is used for both
        log_lookup, warn_missing: see _write_image_figures
    """
    image_count = 0

    for name in names:
        lengths = None
        offsets = None

        if use_periodic_lengths:
            try:
                lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                    config, name, is_unique=is_unique)
            except FileNotFoundError:
                msg = (
                    "Could not parse out lengths and offsets for sample: {}. "
                    "Skipping".format(name))
                logger.error(msg)
                continue

        image_count = _write_image_figures(
            out,
            get_image_files(name, lengths, offsets),
            caption_format.format(name),
            image_count,
            images_per_page,
            height,
            log_lookup=log_lookup,
            warn_missing=warn_missing)

    # start the next part of the report on a fresh page
    if image_count % images_per_page != 0:
        latex.clearpage(out)


def main():
    """Create the plots and the latex report for the Rp-Bp ORF predictions.

    After parsing the command line and reading the config file, the script
    either resubmits itself via slurm (when requested) or creates all of the
    figures, writes a latex document which includes the image files that
    exist on disk (ORF type bar charts, ORF length distributions and,
    optionally, ORF type metagene profiles, each for the samples and then for
    the replicates) and finally compiles the document.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script creates the plots which detail the basic characteristics "
        "of the ORF predictions from the Rp-Bp pipeline. It also creates and compiles (if "
        "possible) a latex report for them.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('out',
                        help="The base output directory for the latex report")

    parser.add_argument(
        '--show-unfiltered-orfs',
        help="If this flag is "
        "present, bar charts showing the distribution of the types of the "
        "unfiltered ORF set will be included",
        action='store_true')

    parser.add_argument(
        '--show-orf-periodicity',
        help="If this flag is "
        "present, bar charts showing the periodicity of each ORF type will be "
        "included in the report.",
        action='store_true')

    parser.add_argument('--uniprot',
                        help="The uniprot ORF lengths, if available",
                        default=default_uniprot)
    parser.add_argument('--uniprot-label',
                        help="The label to use for the uniprot ORFs in "
                        "the plot",
                        default=default_uniprot_label)

    parser.add_argument('--image-type',
                        help="The format of the image files. This must be "
                        "a format usable by matplotlib.",
                        default=default_image_type)

    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files will "
                        "be overwritten.",
                        action='store_true')

    parser.add_argument(
        '--note',
        help="If this option is given, it will be used in the "
        "filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=default_note)

    parser.add_argument(
        '--show-chisq',
        help="If this flag is given, then the "
        "results from Rp-chi will be included in the document; otherwise, they "
        "will not be created or shown.",
        action='store_true')

    parser.add_argument('-t',
                        '--tmp',
                        help="A location for temporary files",
                        default=default_tmp)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # Close the file deterministically, and use the safe loader: calling
    # yaml.load without an explicit Loader is rejected by recent PyYAML
    # releases and can construct arbitrary objects on older ones.
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)

    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    programs = [
        'create-orf-length-distribution-line-graph',
        'create-orf-types-bar-chart', 'visualize-orf-type-metagene-profiles'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = ['riboseq_data', 'riboseq_samples']
    utils.check_keys_exist(config, required_keys)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # by default, we will not include chisq
    chisq_values = [False]
    if args.show_chisq:
        chisq_values = [True, False]

    filtered_values = [True]
    if args.show_unfiltered_orfs:
        filtered_values = [True, False]

    grouped_values = [True, False]
    strands = ["+", "-"]

    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)

    # first, create all of the figures
    create_all_figures(config, args)

    # the note given on the command line replaces the one in the config file
    out_note_str = config.get('note', None)
    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note

    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations',
                                        None)

    project_name = config.get("project_name", default_project_name)
    title = "Rp-Bp prediction analysis for {}".format(project_name)
    abstract = "This document shows the results of the Rp-Bp pipeline analysis."

    tex_file = filenames.get_rpbp_prediction_report(args.out, out_note_str)

    # The generators below yield the expected image paths for one dataset;
    # the paths are created lazily, right before each one is checked.

    def bar_chart_files(name, lengths, offsets):
        it = itertools.product(grouped_values, chisq_values, filtered_values)
        for is_grouped, is_chisq, is_filtered in it:
            f, rw = _smoothing_arguments(is_chisq, fraction,
                                         reweighting_iterations)
            yield filenames.get_orf_types_bar_chart(
                config['riboseq_data'],
                name,
                length=lengths,
                offset=offsets,
                is_unique=is_unique,
                note=out_note_str,
                image_type=args.image_type,
                fraction=f,
                reweighting_iterations=rw,
                is_grouped=is_grouped,
                is_chisq=is_chisq,
                is_filtered=is_filtered)

    def length_line_graph_files(name, lengths, offsets):
        it = itertools.product(grouped_values, chisq_values)
        for is_grouped, is_chisq in it:
            f, rw = _smoothing_arguments(is_chisq, fraction,
                                         reweighting_iterations)
            yield filenames.get_orf_length_distribution_line_graph(
                config['riboseq_data'],
                name,
                length=lengths,
                offset=offsets,
                is_unique=is_unique,
                note=out_note_str,
                image_type=args.image_type,
                fraction=f,
                reweighting_iterations=rw,
                is_grouped=is_grouped,
                is_chisq=is_chisq)

    def metagene_profile_files(name, lengths, offsets):
        for is_chisq in chisq_values:
            f, rw = _smoothing_arguments(is_chisq, fraction,
                                         reweighting_iterations)
            orf_type_profile_base = filenames.get_orf_type_profile_base(
                config['riboseq_data'],
                name,
                length=lengths,
                offset=offsets,
                is_unique=is_unique,
                note=out_note_str,
                fraction=f,
                reweighting_iterations=rw,
                is_chisq=is_chisq)

            it = itertools.product(ribo_utils.orf_types, strands)
            for orf_type, strand in it:
                yield filenames.get_orf_type_profile_image(
                    orf_type_profile_base, orf_type, strand, args.image_type)

    with open(tex_file, 'w') as out:

        latex.begin_document(out, title, abstract)

        latex.write(out, "\n")

        latex.clearpage(out)

        ### ORF type distributions
        title = "Predicted ORF type distributions"
        latex.section(out, title)

        # first, handle all of the regular datasets
        sample_names = sorted(config['riboseq_samples'].keys())

        # and check if we also have replicates
        replicate_names = []
        if 'riboseq_biological_replicates' in config:
            replicate_names = sorted(
                ribo_utils.get_riboseq_replicates(config).keys())

        # Six bar charts per page. Figures used to be opened on multiples of
        # four but closed on multiples of six, which left graphics outside of
        # any figure once more than six images had been written.
        _write_dataset_figures(out,
                               config,
                               sample_names,
                               is_unique,
                               bar_chart_files,
                               "ORF types: {}",
                               images_per_page=6,
                               height=0.15)

        # now, if the config file specifies replicates, create figures for those
        _write_dataset_figures(out,
                               config,
                               replicate_names,
                               is_unique,
                               bar_chart_files,
                               "ORF types: {}",
                               images_per_page=6,
                               height=0.15,
                               use_periodic_lengths=False)

        ### ORF type length distributions
        title = "Predicted ORF type length distributions"
        latex.section(out, title)

        _write_dataset_figures(out,
                               config,
                               sample_names,
                               is_unique,
                               length_line_graph_files,
                               "ORF type length distributions: {}",
                               images_per_page=4,
                               height=0.15,
                               log_lookup=False,
                               warn_missing=False)

        # now, if the config file specifies replicates, create figures for
        # those (the caption previously read "ORF types", which was copied
        # from the bar chart section)
        _write_dataset_figures(out,
                               config,
                               replicate_names,
                               is_unique,
                               length_line_graph_files,
                               "ORF type length distributions: {}",
                               images_per_page=4,
                               height=0.15,
                               use_periodic_lengths=False,
                               log_lookup=False,
                               warn_missing=False)

        ### ORF type metagene profiles
        if args.show_orf_periodicity:
            title = "Predicted ORF type metagene profiles"
            latex.section(out, title)

            _write_dataset_figures(out,
                                   config,
                                   sample_names,
                                   is_unique,
                                   metagene_profile_files,
                                   "ORF type metagene profiles: {}",
                                   images_per_page=4,
                                   height=0.23)

            _write_dataset_figures(out,
                                   config,
                                   replicate_names,
                                   is_unique,
                                   metagene_profile_files,
                                   "ORF type metagene profiles: {}",
                                   images_per_page=4,
                                   height=0.23,
                                   use_periodic_lengths=False,
                                   log_lookup=False,
                                   warn_missing=False)

        latex.end_document(out)

    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)
Beispiel #4
0
def main():
    """Estimate Bayes factors and chi-square values for a set of ORFs.

    The function parses the command line and, if ``--use-slurm`` was given,
    hands the exact command line to ``slurm.check_sbatch`` and returns.
    Otherwise it:

    1. reads the regions (BED12+) and filters them by ORF type, length and
       profile sum;
    2. unpickles the translated and untranslated models;
    3. copies the CSR components of the profile matrix into
       ``multiprocessing.RawArray`` objects stored in module-level globals;
    4. processes the regions with ``parallel.apply_parallel_split`` and
       ``get_all_bayes_factors_args``;
    5. concatenates the per-group results and writes them to ``args.out``.

    The parsed arguments, the models and the shared arrays are published as
    module-level globals.
    """
    global profiles_data, profiles_indices, profiles_indptr, profiles_shape
    global translated_models, untranslated_models
    global args

    # NOTE: the backslash in "\\chi" is escaped explicitly. The text seen by
    # argparse is unchanged, but the literal no longer relies on an invalid
    # escape sequence ("\c").
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""
            This script uses Hamiltonian MCMC with Stan to estimate translation parameters
            for a set of regions (presumably ORFs). Roughly, it takes as input:

            (1) a set of regions (ORFs) and their corresponding profiles
            (2) a "translated" model which gives the probability that a region is translated
            (3) an "untranslated" model which gives the probability that a region is not translated

            The script first smoothes the profiles using LOWESS. It then calculates
            both the Bayes' factor (using the smoothed profile) and \\chi^2 value
            (using the raw counts) for each ORF.
        """
        )

    parser.add_argument('profiles', help="The ORF profiles (counts) (mtx)")
    parser.add_argument('regions', help="The regions (ORFs) for which predictions will "
        "be made (BED12+)")

    parser.add_argument('out', help="The output file for the Bayes' factors (BED12+)")

    parser.add_argument('--chi-square-only', help="If this flag is present, then only the chi "
        "square test will be performed for each ORF. This can also be a way to get the counts "
        "within each of the ORFs.", action='store_true')

    parser.add_argument('--translated-models', help="The models to use as H_t (pkl)", nargs='+')
    parser.add_argument('--untranslated-models', help="The models to use as H_u (pkl)", nargs='+')

    ### filtering options
    parser.add_argument('--orf-types', help="If values are given, then only orfs with "
        "those types are processed.", nargs='*', default=default_orf_types)
    parser.add_argument('--orf-type-field', default=default_orf_type_field)

    parser.add_argument('--min-length', help="ORFs with length less than this value will not "
        "be processed", type=int, default=default_min_length)
    parser.add_argument('--max-length', help="ORFs with length greater than this value will not "
        "be processed", type=int, default=default_max_length)
    parser.add_argument('--min-profile', help="ORFs with profile sum (i.e., number "
        "of reads) less than this value will not be processed.", type=float,
        default=default_min_profile)

    ### smoothing options
    parser.add_argument('--fraction', help="The fraction of signal to use in LOWESS",
        type=float, default=default_fraction)

    parser.add_argument('--reweighting-iterations', help="The number of reweighting "
        "iterations to use in LOWESS. Please see the statsmodels documentation for a "
        "detailed description of this parameter.", type=int, default=default_reweighting_iterations)

    ### MCMC options
    parser.add_argument('-s', '--seed', help="The random seeds to use for inference",
        type=int, default=default_seed)
    parser.add_argument('-c', '--chains', help="The number of MCMC chains to use", type=int,
        default=default_chains)
    parser.add_argument('-i', '--iterations', help="The number of MCMC iterations to use for "
        "each chain", type=int, default=default_iterations)

    ### behavior options
    parser.add_argument('--num-orfs', help="If n>0, then only this many ORFs will be processed",
        type=int, default=default_num_orfs)
    parser.add_argument('--orf-num-field', default=default_orf_num_field)

    parser.add_argument('--do-not-compress', help="Unless otherwise specified, the output will "
        "be written in GZip format", action='store_true')

    parser.add_argument('-g', '--num-groups', help="The number of groups into which to split "
        "the ORFs. More groups means the progress bar is updated more frequently but incurs "
        "more overhead because of the parallel calls.", type=int, default=default_num_groups)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # when running under SLURM, resubmit the exact command line and stop here
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # read in the regions and apply the filters
    msg = "Reading and filtering ORFs"
    logger.info(msg)
    regions = bed_utils.read_bed(args.regions)

    # by default, keep everything
    m_filters = np.ones(len(regions), dtype=bool)

    # orf type (an empty list of types disables this filter)
    if len(args.orf_types) > 0:
        m_orf_type = regions[args.orf_type_field].isin(args.orf_types)
        m_filters = m_orf_type & m_filters

    # min length (non-positive values disable this filter)
    if args.min_length > 0:
        m_min_length = regions['orf_len'] >= args.min_length
        m_filters = m_min_length & m_filters

    # max length (non-positive values disable this filter)
    if args.max_length > 0:
        m_max_length = regions['orf_len'] <= args.max_length
        m_filters = m_max_length & m_filters

    # min profile: keep ORFs whose row in the profile matrix sums to at
    # least the threshold; the row index is matched against 'orf_num'
    # NOTE(review): the column name is hard-coded here even though
    # --orf-num-field is parsed above -- confirm whether that is intended.
    profiles = scipy.io.mmread(args.profiles).tocsr()
    profiles_sums = profiles.sum(axis=1)
    good_orf_nums = np.where(profiles_sums >= args.min_profile)
    good_orf_nums = set(good_orf_nums[0])
    m_profile = regions['orf_num'].isin(good_orf_nums)
    m_filters = m_profile & m_filters

    regions = regions[m_filters]

    if args.num_orfs > 0:
        regions = regions.head(args.num_orfs)

    regions = regions.reset_index(drop=True)

    msg = "Number of regions after filtering: {}".format(len(regions))
    logger.info(msg)

    # NOTE(review): unpickling executes arbitrary code embedded in the file;
    # only pass model files from trusted sources.
    logger.debug("Reading models")
    translated_models = []
    for model_path in args.translated_models:
        # close each model file as soon as it has been read
        with open(model_path, 'rb') as model_file:
            translated_models.append(pickle.load(model_file))

    untranslated_models = []
    for model_path in args.untranslated_models:
        with open(model_path, 'rb') as model_file:
            untranslated_models.append(pickle.load(model_file))

    # copy the CSR components into raw (unsynchronized) ctypes arrays and
    # publish them through the module-level globals declared above
    profiles_data = multiprocessing.RawArray(ctypes.c_double, profiles.data.flat)
    profiles_indices = multiprocessing.RawArray(ctypes.c_int, profiles.indices)
    profiles_indptr = multiprocessing.RawArray(ctypes.c_int, profiles.indptr)
    profiles_shape = multiprocessing.RawArray(ctypes.c_int, profiles.shape)

    bfs_l = parallel.apply_parallel_split(
        regions,
        args.num_cpus,
        get_all_bayes_factors_args,
        num_groups=args.num_groups,
        progress_bar=True
    )

    bfs = pd.concat(bfs_l)

    # write the results as a bed12+ file
    bed_utils.write_bed(bfs, args.out)
def main():
    """Create and compile the LaTeX preprocessing report for a project.

    The function parses the command line, loads the YAML config and checks
    that the required external programs exist. If ``--use-slurm`` was given,
    it hands the exact command line to ``slurm.check_sbatch`` and returns.
    Otherwise it creates the figures, writes ``preprocessing-report.tex``
    into ``args.out`` and compiles it. The document contains:

    * the read filtering figures,
    * a table of read length distributions (all vs. uniquely-aligning reads),
    * one table per sample with the metagene profile (and optionally the
      Bayes factor plot) for each read length,
    * optionally, the ORF type periodicity figures.

    Finally, if requested, the fastqc reports are created in parallel.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates a simple latex document containing the read "
        "filtering images, metagene profiles and analysis, and standard section text.")
    parser.add_argument('config', help="The (yaml) config file for the project")
    parser.add_argument('out', help="The path for the output files")

    parser.add_argument('--show-orf-periodicity', help="If this flag is "
        "present, bar charts showing the periodicity of each ORF type will be "
        "included in the report.", action='store_true')

    parser.add_argument('--show-read-length-bfs', help="If this flag is given, "
        "plots showing the Bayes factor at each offset for each read length "
        "are included in the report.", action='store_true')

    parser.add_argument('--overwrite', help="If this flag is present, existing files will "
        "be overwritten.", action='store_true')

    parser.add_argument('--min-visualization-count', help="Read lengths with fewer than this "
        "number of reads will not be included in the report.", type=int,
        default=default_min_visualization_count)

    parser.add_argument('--image-type', help="The type of image types to create. This "
        "must be an extension which matplotlib can interpret.", default=default_image_type)

    parser.add_argument('-c', '--create-fastqc-reports', help="If this flag is given, then "
        "fastqc reports will be created for most fastq and bam files. By default, they are "
        "not created.", action='store_true')

    parser.add_argument('--tmp', help="If the fastqc reports are created, "
        "they will use this location for temp files", default=default_tmp)

    parser.add_argument('--note', help="If this option is given, it will be used in the "
        "filenames.\n\nN.B. This REPLACES the note in the config file.", default=default_note)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # NOTE(review): the config is still parsed with a full (non-safe) YAML
    # loader, so only trusted config files should be used. The loader is
    # passed explicitly because PyYAML >= 6 rejects yaml.load() without one;
    # FullLoader is the default of PyYAML 5.x, and yaml.Loader was the
    # default of the older releases which do not provide FullLoader.
    yaml_loader = getattr(yaml, 'FullLoader', yaml.Loader)
    with open(args.config) as config_file:
        config = yaml.load(config_file, Loader=yaml_loader)

    # the identity test tells "option omitted" (argparse hands back the
    # default object itself) apart from an explicitly given note
    if args.note is not default_note:
        config['note'] = args.note
    note = config.get('note', None)

    sample_names = sorted(config['riboseq_samples'].keys())

    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    programs = [
        'create-read-length-metagene-profile-plot',
        'visualize-metagene-profile-bayes-factor',
        'get-all-read-filtering-counts',
        'samtools',
        'visualize-read-filtering-counts',
        'get-read-length-distribution',
        'plot-read-length-distribution'
    ]

    if args.create_fastqc_reports:
        programs.extend(['fastqc','java'])

    shell_utils.check_programs_exist(programs)

    # when running under SLURM, resubmit the exact command line and stop here
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)

    # first, create the read filtering information...
    create_read_filtering_plots(args.config, config, args)
    # ... and all the other figures.
    for name in sample_names:
        periodic_offsets = filenames.get_periodic_offsets(config['riboseq_data'],
            name, is_unique=is_unique, note=note)
        offsets_df = pd.read_csv(periodic_offsets)
        create_figures(args.config, config, name, offsets_df, args)

    # thresholds used below to label each read length's offset status
    min_metagene_profile_count = config.get(
        "min_metagene_profile_count", default_min_metagene_profile_count)

    min_metagene_profile_bayes_factor_mean = config.get(
        "min_metagene_profile_bayes_factor_mean",
        default_min_metagene_profile_bayes_factor_mean)

    max_metagene_profile_bayes_factor_var = config.get(
        "max_metagene_profile_bayes_factor_var",
        default_max_metagene_profile_bayes_factor_var)

    project_name = config.get("project_name", default_project_name)
    title = "Preprocessing results for {}".format(project_name)

    tex_file = os.path.join(args.out, "preprocessing-report.tex")
    with open(tex_file, 'w') as out:

        latex.begin_document(out, title, abstract, commands=commands)

        latex.section(out, "Introduction")

        latex.clearpage(out)
        latex.newpage(out)

        latex.section(out, "Mapping and filtering")
        latex.write(out, mapping_and_filtering_text)

        # the read filtering figures
        read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=note, image_type=args.image_type)

        n = "no-rrna-{}".format(note)
        no_rrna_read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=n, image_type=args.image_type)

        latex.begin_figure(out)
        latex.write_graphics(out, read_filtering_image, width=0.45)
        latex.write_graphics(out, no_rrna_read_filtering_image, width=0.45)
        latex.write_caption(out, read_filtering_caption, label=read_filtering_label)
        latex.end_figure(out)

        latex.clearpage(out)

        # the read length distributions
        latex.section(out, "Read length distributions",
            label=length_distribution_section_label)

        msg = "Writing length distribution figures"
        logger.info(msg)

        latex.begin_table(out, "cc")

        latex.write_header(out, ["All aligned reads", "Uniquely-aligning reads"])

        for name in sample_names:
            read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'], name, is_unique=False, note=note,
                image_type=args.image_type)

            unique_read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'], name, is_unique=True, note=note,
                image_type=args.image_type)

            # one table row per sample: the first cell is closed with a
            # column separator, the second one with a row separator
            cells = (
                (read_length_distribution_image, latex.write_column_sep),
                (unique_read_length_distribution_image, latex.write_row_sep),
            )

            for image, write_sep in cells:
                msg = "Looking for image file: {}".format(image)
                logger.debug(msg)

                if os.path.exists(image):
                    latex.write_graphics(out, image, width=0.45)
                else:
                    msg = "Could not find image: {}".format(image)
                    logger.warning(msg)

                    # keep the table layout intact with a placeholder
                    text = "Missing: {}\n\n".format(name)
                    latex.write(out, text)

                write_sep(out)

        latex.end_table(out)
        latex.clearpage(out)

        latex.section(out, "Read length periodicity", label=periodicity_label)

        for name in sample_names:
            # number of cells written so far to this sample's two-column table
            i = 0

            msg = "Processing sample: {}".format(name)
            logger.info(msg)

            logger.debug("overwrite: {}".format(args.overwrite))

            periodic_offsets = filenames.get_periodic_offsets(config['riboseq_data'],
                name, is_unique=is_unique, note=note)
            offsets_df = pd.read_csv(periodic_offsets)

            min_read_length = int(offsets_df['length'].min())
            max_read_length = int(offsets_df['length'].max())

            latex.begin_table(out, "YY")

            header = "\\multicolumn{2}{c}{" + name + "}"
            header = [header]
            latex.write_header(out, header)

            for length in range(min_read_length, max_read_length + 1):
                msg = "Processing length: {}".format(length)
                logger.info(msg)

                # check which offset is used

                # select the row for this length
                mask_length = offsets_df['length'] == length

                # TODO: this is sometimes length 0. why?
                if sum(mask_length) == 0:
                    continue

                length_row = offsets_df[mask_length].iloc[0]

                # now, check all of the filters; a later failing filter
                # overrides the status set by an earlier one
                offset = int(length_row['highest_peak_offset'])
                offset_status = "Used for analysis"

                if length_row['highest_peak_bf_mean'] < min_metagene_profile_bayes_factor_mean:
                    offset_status = "BF mean too small"

                if length_row['highest_peak_bf_var'] > max_metagene_profile_bayes_factor_var:
                    offset_status = "BF variance too high"

                if length_row['highest_peak_profile_sum'] < min_metagene_profile_count:
                    offset_status = "Count too small"

                if length_row['highest_peak_profile_sum'] < args.min_visualization_count:
                    msg = "Not enough reads of this length. Skipping."
                    logger.warning(msg)
                    continue

                metagene_profile_image = filenames.get_metagene_profile_image(
                    config['riboseq_data'], name, image_type=args.image_type,
                    is_unique=is_unique, length=length, note=note)

                title = ("Length: {}. P-site offset: {}. Status: {}\n".format(length, offset, offset_status))
                if args.show_read_length_bfs:
                    # the title spans both columns (profile and Bayes factor
                    # plot); the backslash is escaped explicitly so that the
                    # literal does not rely on the invalid escape "\s"
                    title = "\\scriptsize{" + title + "}"
                    title = "\\multicolumn{2}{c}{" + title + "}"
                    latex.write(out, title)
                    latex.write_row_sep(out)
                else:
                    latex.write(out, title, size="scriptsize")

                latex.write_graphics(out, metagene_profile_image, width=0.45)

                # odd cell count: stay on the row; even: finish the row
                i += 1
                if i%2 == 1:
                    latex.write_column_sep(out)
                else:
                    latex.write_row_sep(out)

                if args.show_read_length_bfs:

                    bayes_factor_image = filenames.get_metagene_profile_bayes_factor_image(
                        config['riboseq_data'], name, image_type=args.image_type,
                        is_unique=is_unique, length=length, note=note)

                    latex.write_graphics(out, bayes_factor_image, width=0.45)

                    i += 1
                    if i%2 == 1:
                        latex.write_column_sep(out)
                    else:
                        latex.write_row_sep(out)

            # close a half-filled last row
            if i%2 == 1:
                latex.write_row_sep(out)

            latex.end_table(out)
            latex.clearpage(out)

        ### ORF type metagene profiles
        if args.show_orf_periodicity:
            title = "ORF type periodicity"
            latex.section(out, title)

            strands = ['+', '-']
            for sample_name in sample_names:
                # number of images written for this sample; four per figure
                i = 0

                try:
                    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                        config, sample_name, is_unique=is_unique)
                except FileNotFoundError:
                    msg = ("Could not parse out lengths and offsets for sample: {}. "
                        "Skipping".format(sample_name))
                    logger.error(msg)
                    continue

                orf_type_profile_base = filenames.get_orf_type_profile_base(
                    config['riboseq_data'], sample_name, length=lengths, offset=offsets,
                    is_unique=is_unique, note=note, subfolder='orf-profiles')

                for orf_type in ribo_utils.orf_types:
                    for strand in strands:
                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base, orf_type, strand,
                            image_type=args.image_type)

                        msg = "Looking for image file: {}".format(orf_type_profile)
                        logger.debug(msg)
                        if os.path.exists(orf_type_profile):
                            if i%4 == 0:
                                latex.begin_figure(out)

                            i += 1
                            latex.write_graphics(out, orf_type_profile, height=0.23)

                            if i%4 == 0:
                                latex.end_figure(out)
                                latex.clearpage(out)

                # close a figure holding fewer than four images
                if (i>0) and (i%4 != 0):
                    latex.end_figure(out)
                    latex.clearpage(out)

        latex.end_document(out)

    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)

    if args.create_fastqc_reports:
        parallel.apply_parallel_iter(config['riboseq_samples'].items(),
            args.num_cpus,
            create_fastqc_reports, config, args)
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates the plots which detail the basic characteristics "
        "of the ORF predictions from the Rp-Bp pipeline. It also creates and compiles (if "
        "possible) a latex report for them.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('out', help="The base output directory for the latex report")

    parser.add_argument('--show-unfiltered-orfs', help="If this flag is "
        "present, bar charts showing the distribution of the types of the "
        "unfiltered ORF set will be included", action='store_true')
    
    parser.add_argument('--show-orf-periodicity', help="If this flag is "
        "present, bar charts showing the periodicity of each ORF type will be "
        "included in the report.", action='store_true')

    parser.add_argument('--uniprot', help="The uniprot ORF lengths, if available", 
        default=default_uniprot)
    parser.add_argument('--uniprot-label', help="The label to use for the uniprot ORFs in "
        "the plot", default=default_uniprot_label)
    
    parser.add_argument('--image-type', help="The format of the image files. This must be "
        "a format usable by matplotlib.", default=default_image_type)

    parser.add_argument('--overwrite', help="If this flag is present, existing files will "
        "be overwritten.", action='store_true')
        
    parser.add_argument('--note', help="If this option is given, it will be used in the "
        "filenames.\n\nN.B. This REPLACES the note in the config file.", default=default_note)

    parser.add_argument('--show-chisq', help="If this flag is given, then the "
        "results from Rp-chi will be included in the document; otherwise, they "
        "will not be created or shown.", action='store_true')

    parser.add_argument('-t', '--tmp', help="A location for temporary files",
        default=default_tmp)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config))
    
    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    programs =  [   
        'create-orf-length-distribution-line-graph',
        'create-orf-types-bar-chart',
        'visualize-orf-type-metagene-profiles'
    ]
    shell_utils.check_programs_exist(programs)
    
    required_keys = [
        'riboseq_data',
        'riboseq_samples'
    ]
    utils.check_keys_exist(config, required_keys)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # by default, we will not include chisq
    chisq_values = [False]
    if args.show_chisq:
        chisq_values = [True, False]

    filtered_values = [True]
    if args.show_unfiltered_orfs:
        filtered_values = [True, False]

    grouped_values = [True, False]
    
    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)

    # first, create all of the figures
    create_all_figures(config, args)

    note_str = config.get('note', None)
    out_note_str = note_str

    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note
    
    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations', None)

    project_name = config.get("project_name", default_project_name)
    title = "Rp-Bp prediction analysis for {}".format(project_name)
    abstract = "This document shows the results of the Rp-Bp pipeline analysis."

    
    #tex_file = os.path.join(args.out, "prediction-report.tex")
    tex_file = filenames.get_rpbp_prediction_report(args.out, out_note_str)

    
    with open(tex_file, 'w') as out:

        latex.begin_document(out, title, abstract)

        latex.write(out, "\n")

        latex.clearpage(out)

        ### ORF type distributions
        title = "Predicted ORF type distributions"
        latex.section(out, title)

        # first, handle all of the regular datasets
        sample_names = sorted(config['riboseq_samples'].keys())
        
        # and check if we also have replicates
        replicate_names = []
        if 'riboseq_biological_replicates' in config:
            replicate_names = sorted(ribo_utils.get_riboseq_replicates(config).keys())

        strands = ["+", "-"]

        i = 0
        for sample_name in sample_names:
            
            try:
                lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                    config, sample_name, is_unique=is_unique)
            except FileNotFoundError:
                msg = ("Could not parse out lengths and offsets for sample: {}. "
                    "Skipping".format(sample_name))
                logger.error(msg)
                continue
            
            caption = "ORF types: {}".format(sample_name)
            is_first = True

            # first, just dump all of the bar charts to the page            
            it = itertools.product(grouped_values, chisq_values, filtered_values)

            for is_grouped, is_chisq, is_filtered in it:

                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations
                
                orf_types_bar_chart = filenames.get_orf_types_bar_chart(
                    config['riboseq_data'], 
                    sample_name, 
                    length=lengths, 
                    offset=offsets, 
                    is_unique=is_unique, 
                    note=out_note_str, 
                    image_type=args.image_type,
                    fraction=f, 
                    reweighting_iterations=rw,
                    is_grouped=is_grouped, 
                    is_chisq=is_chisq, 
                    is_filtered=is_filtered
                )

                msg = "Looking for image file: {}".format(orf_types_bar_chart)
                logger.debug(msg)

                if os.path.exists(orf_types_bar_chart):
                    if is_first or (i%4 == 0):
                        latex.begin_figure(out)
                        is_first = False
                    
                    i += 1
                    latex.write_graphics(out, orf_types_bar_chart, height=0.15)

                    if i%6 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)

                else:
                    msg = "Could not find image: {}".format(orf_types_bar_chart)
                    logger.warning(msg)



            if (i > 0) and (i%6) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i%6 != 0:
            latex.clearpage(out)

    
        # now, if the config file specifies replicates, create figures for those                
        i = 0
        for replicate_name in replicate_names:
            lengths = None
            offsets = None

            caption = "ORF types: {}".format(replicate_name)

            it = itertools.product(grouped_values, chisq_values, filtered_values)

            is_first = True

            for is_grouped, is_chisq, is_filtered in it:

                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                
                orf_types_bar_chart = filenames.get_orf_types_bar_chart(
                    config['riboseq_data'], 
                    replicate_name, 
                    length=lengths, 
                    offset=offsets, 
                    is_unique=is_unique, 
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq,
                    is_filtered=is_filtered
                )

                
                msg = "Looking for image file: {}".format(orf_types_bar_chart)
                logger.debug(msg)

                if os.path.exists(orf_types_bar_chart):
                    if is_first or (i%4 == 0):
                        latex.begin_figure(out)
                        is_first = False
                    
                    i += 1
                    latex.write_graphics(out, orf_types_bar_chart, height=0.15)

                    if i%6 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)

                else:
                    msg = "Could not find image: {}".format(orf_types_bar_chart)
                    logger.warning(msg)


            if (i > 0) and (i%6) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i%6 != 0:
            latex.clearpage(out)


        ### ORF type length distributions
        title = "Predicted ORF type length distributions"
        latex.section(out, title)

        i = 0
        for sample_name in sample_names:
            
            try:
                lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                    config, sample_name, is_unique=is_unique)
            except FileNotFoundError:
                msg = ("Could not parse out lengths and offsets for sample: {}. "
                    "Skipping".format(sample_name))
                logger.error(msg)
                continue
            
            caption = "ORF type length distributions: {}".format(sample_name)

            is_first = True
            it = itertools.product(grouped_values, chisq_values)
            
            for is_grouped, is_chisq in it:

                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                orf_length_line_graph = filenames.get_orf_length_distribution_line_graph(
                    config['riboseq_data'], 
                    sample_name,
                    length=lengths,
                    offset=offsets, 
                    is_unique=is_unique,
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq
                )

                if os.path.exists(orf_length_line_graph):
            
                    if is_first or (i%4 == 0):
                        latex.begin_figure(out)
                        is_first = False
                    
                    i += 1
                    latex.write_graphics(out, orf_length_line_graph, height=0.15)

                    if i%4 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)

                else:
                    msg = "Could not find image: {}".format(orf_length_line_graph)
                    logger.debug(msg)


            if (i > 0) and (i%4) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i%4 != 0:
            latex.clearpage(out)

        # now, if the config file specifies replicates, create figures for those  
        i = 0
        for replicate_name in replicate_names:
            lengths = None
            offsets = None

            caption = "ORF types: {}".format(replicate_name)

            is_first = True
            it = itertools.product(grouped_values, chisq_values)
            
            for is_grouped, is_chisq in it:

                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                orf_length_line_graph = filenames.get_orf_length_distribution_line_graph(
                    config['riboseq_data'],
                    replicate_name,
                    length=lengths,
                    offset=offsets, 
                    is_unique=is_unique,
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq
                )
                
                if os.path.exists(orf_length_line_graph):
            
                    if is_first or (i%4 == 0):
                        latex.begin_figure(out)
                        is_first = False
                    
                    i += 1
                    latex.write_graphics(out, orf_length_line_graph, height=0.15)

                    if i%4 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)

                else:
                    msg = "Could not find image: {}".format(orf_length_line_graph)
                    logger.debug(msg)

            if (i > 0) and (i%4) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i%4 != 0:
            latex.clearpage(out)

        ### ORF type metagene profiles
        if args.show_orf_periodicity:
            title = "Predicted ORF type metagene profiles"
            latex.section(out, title)
            
            i = 0
            for sample_name in sample_names:
                
                try:
                    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                        config, sample_name, is_unique=is_unique)
                except FileNotFoundError:
                    msg = ("Could not parse out lengths and offsets for sample: {}. "
                        "Skipping".format(sample_name))
                    logger.error(msg)
                    continue
                
                caption = "ORF type metagene profiles: {}".format(sample_name)

                is_first = True

                for is_chisq in chisq_values:

                    if is_chisq:
                        f = None
                        rw = None
                    else:
                        f = fraction
                        rw = reweighting_iterations

                    orf_type_profile_base = filenames.get_orf_type_profile_base(
                        config['riboseq_data'], 
                        sample_name, 
                        length=lengths, offset=offsets, 
                        is_unique=is_unique, 
                        note=out_note_str,
                        fraction=f, reweighting_iterations=rw,
                        is_chisq=is_chisq)

                    it = itertools.product(ribo_utils.orf_types, strands)

                    for orf_type, strand in it:
                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base, 
                            orf_type, 
                            strand, 
                            args.image_type
                        )

                        msg = "Looking for image file: {}".format(orf_type_profile)
                        logger.debug(msg)
                        if os.path.exists(orf_type_profile):

                            if is_first or (i%4 == 0):
                                latex.begin_figure(out)
                                is_first = False

                            i += 1
                            latex.write_graphics(out, orf_type_profile, height=0.23)

                            if i%4 == 0:
                                latex.write_caption(out, caption)
                                latex.end_figure(out)
                                latex.clearpage(out)

                        else:
                            msg = "Could not find image: {}".format(orf_type_profile)
                            logger.warning(msg)

                if (i > 0) and (i%4 != 0):
                    latex.write_caption(out, caption)
                    latex.end_figure(out)
                    #latex.clearpage(out)

            if i%4 != 0:
                latex.clearpage(out)

            i = 0
            for replicate_name in replicate_names:
                lengths = None
                offsets = None
                            
                caption = "ORF type metagene profiles: {}".format(replicate_name)
                is_first = True
                for is_chisq in chisq_values:

                    if is_chisq:
                        f = None
                        rw = None
                    else:
                        f = fraction
                        rw = reweighting_iterations

                    orf_type_profile_base = filenames.get_orf_type_profile_base(
                        config['riboseq_data'], 
                        replicate_name, 
                        length=lengths, offset=offsets, 
                        is_unique=is_unique, 
                        note=out_note_str, 
                        fraction=f, reweighting_iterations=rw,
                        is_chisq=is_chisq
                    )

                    it = itertools.product(ribo_utils.orf_types, strands)

                    for orf_type, strand in it:

                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base, 
                            orf_type, 
                            strand, 
                            args.image_type
                        )

                        if os.path.exists(orf_type_profile):

                            if is_first or (i%4 == 0):
                                latex.begin_figure(out)
                                is_first = False

                            i += 1
                            latex.write_graphics(out, orf_type_profile, height=0.23)

                            if i % 4 == 0:
                                latex.write_caption(out, caption)
                                latex.end_figure(out)
                                latex.clearpage(out)
                        else:
                            msg = "Could not find image: {}".format(orf_type_profile)
                            logger.debug(msg)
                
                if (i > 0) and (i%4 != 0):
                    latex.write_caption(out, caption)
                    latex.end_figure(out)
                    #latex.clearpage(out)

            if i%4 != 0:
                latex.clearpage(out)

        latex.end_document(out)

    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)
Beispiel #7
0
def main():
    """Command-line entry point: convert one or more bam files to bigWig.

    Builds the argument parser, verifies that ``wigToBigWig`` is available,
    and then either resubmits the exact command line through
    ``slurm.check_sbatch`` (when ``args.use_slurm`` is set) or applies
    ``bam_to_wiggle`` to every bam file given on the command line.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script converts bam files to bigWig files. It is mostly "
        "a port of this script: https://github.com/chapmanb/bcbb/blob/master/nextgen/scripts/bam_to_wiggle.py "
        "by Brad Chapman which avoids a few dependencies.\n\nThe wigToBigWig "
        "program (from UCSC tools) must be in the path.\n\nN.B. If given, the "
        "start and end coordinates must be base-0.")

    parser.add_argument('bam', help="The bam file", nargs='+')
    parser.add_argument(
        '-o',
        '--overwrite',
        help="If this flag is given, then "
        "the bigWig file will be created whether it exists or not",
        action='store_true')
    parser.add_argument('-c',
                        '--chrom',
                        help="If specified, only alignments "
                        "from this chromosome will be in the output",
                        default=default_chrom)
    parser.add_argument('-s',
                        '--start',
                        help="If specified, only alignments "
                        "from this position will be in the output",
                        default=default_start)
    parser.add_argument('-e',
                        '--end',
                        help="If specified, only alignments "
                        "up to this position will be in the output",
                        default=default_end)

    parser.add_argument('-n',
                        '--normalize',
                        help="If this flag is given, "
                        "then values will be normalized to reads per million",
                        action='store_true')

    parser.add_argument(
        '-t',
        '--use-tempfile',
        help="If this flag is given, "
        "then a temp file will be used to avoid permission issues",
        action='store_true')

    parser.add_argument('-k',
                        '--keep-wig',
                        help="If this flag is given, then "
                        "the wiggle file will not be deleted",
                        action='store_true')

    # presumably these helpers register --use-slurm, --num-cpus and the
    # logging flags read below; verify against their definitions
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # fail early if the external converter is missing
    programs = ['wigToBigWig']
    shell_utils.check_programs_exist(programs)

    if args.use_slurm:
        # hand the unchanged command line to slurm instead of running here
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # each bam file is converted independently
    parallel.apply_parallel_iter(args.bam,
                                 args.num_cpus,
                                 bam_to_wiggle,
                                 args,
                                 progress_bar=True)
Beispiel #8
0
def main():
    """Command-line entry point: build the P-site profile of every ORF.

    Steps, in order:

    1. Parse arguments and check that ``--lengths`` and ``--offsets`` have
       the same number of values and that the input files exist.
    2. If ``args.use_slurm`` is set, resubmit the command line through
       ``slurm.check_sbatch`` and return.
    3. Load the P-sites, exons and ORFs, attach ``orf_num`` to the exons,
       and split the exons into groups, pairing each group with the P-sites
       on the same seqnames.
    4. Compute the per-group profiles in parallel, sum them, reverse the
       profiles of the ORFs on the '-' strand, and write the matrix to
       ``args.out``.

    Raises:
        ValueError: if the number of lengths and offsets differ.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script constructs the profile for each ORF. It "
        "first adjusts the mapped read positions to properly align with "
        "the P-sites. Second, it uses a custom chrom-sweep algorithm to "
        "find the coverage of each position in each exon of each ORF. Finally, "
        "the ORF exons are glued together to find the profile of the entire ORF."
    )

    parser.add_argument('bam',
                        help="The bam file including filtered (unique, "
                        "etc.) alignments")
    parser.add_argument('orfs', help="The (bed12) file containing the ORFs")
    parser.add_argument('exons', help="The (bed6+2) file containing the exons")
    parser.add_argument('out',
                        help="The (mtx.gz) output file containing the "
                        "ORF profiles")

    parser.add_argument(
        '-l',
        '--lengths',
        help="If any values are given, "
        "then only reads which have those lengths will be included in the "
        "signal construction.",
        type=int,
        default=default_lengths,
        nargs='*')
    parser.add_argument(
        '-o',
        '--offsets',
        help="The 5' end of reads will be "
        "shifted by this amount. There must be one offset value for each "
        "length (given by the --lengths argument).",
        type=int,
        default=default_offsets,
        nargs='*')

    parser.add_argument('-k',
                        '--num-exons',
                        help="If  k>0, then only the "
                        "first k exons will be processed.",
                        type=int,
                        default=default_num_exons)
    parser.add_argument(
        '-g',
        '--num-groups',
        help="The number of groups into "
        "which to split the exons. More groups means the progress bar is "
        "updated more frequently but incurs more overhead because of the "
        "parallel calls.",
        type=int,
        default=default_num_groups)

    parser.add_argument('--seqname-prefix',
                        help="If present, this string "
                        "will be prepended to the seqname field of the ORFs.",
                        default=default_seqname_prefix)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[extract-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    # make sure the number of lengths and offsets match
    if len(args.lengths) != len(args.offsets):
        msg = "The number of --lengths and --offsets do not match."
        raise ValueError(msg)

    # make sure the necessary files exist
    required_files = [args.bam, args.orfs, args.exons]
    msg = "[extract-orf-profiles]: Some input files were missing: "
    utils.check_files_exist(required_files, msg=msg)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    msg = "Finding P-sites"
    logger.info(msg)

    p_sites = ribo_utils.get_p_sites(args.bam, args.lengths, args.offsets)

    msg = "Reading exons"
    logger.info(msg)
    exons = bed_utils.read_bed(args.exons)

    msg = "Reading ORFs"
    logger.info(msg)

    orfs = bed_utils.read_bed(args.orfs)

    # truthiness also skips a missing (None) prefix instead of failing
    if args.seqname_prefix:
        orfs['seqname'] = args.seqname_prefix + orfs['seqname']
        exons['seqname'] = args.seqname_prefix + exons['seqname']

    if args.num_exons > 0:
        exons = exons.head(args.num_exons)

    # these two values are passed to every parallel call below
    num_orfs = orfs['orf_num'].max() + 1
    max_orf_len = orfs['orf_len'].max()

    msg = "Adding the ORF index to the exons"
    logger.info(msg)

    orf_fields = ['id', 'orf_num']
    exons_orfs = exons.merge(orfs[orf_fields], on='id')

    msg = "Splitting exons and P-sites"
    logger.info(msg)
    exon_groups = pandas_utils.split_df(exons_orfs, args.num_groups)

    exons_dfs = []
    psites_dfs = []

    for _, exon_group in exon_groups:
        # pull out only the p-sites that come from these chromosomes
        seqnames = set(exon_group['seqname'].unique())
        m_psites = p_sites['seqname'].isin(seqnames)

        exons_dfs.append(exon_group)
        psites_dfs.append(p_sites[m_psites])

    # the per-group frames hold everything the workers need, so release
    # the full tables before starting the parallel step
    del p_sites
    del exons_orfs
    del exon_groups
    del exons
    gc.collect()

    # report progress against the number of groups actually produced,
    # which may differ from the requested --num-groups
    num_work_items = len(exons_dfs)
    exons_psites = zip(exons_dfs, psites_dfs)

    msg = "Finding all P-site intersections"
    logger.info(msg)

    sum_profiles = parallel.apply_parallel_iter(exons_psites,
                                                args.num_cpus,
                                                get_all_p_site_intersections,
                                                num_orfs,
                                                max_orf_len,
                                                progress_bar=True,
                                                total=num_work_items)

    msg = "Combining the ORF profiles into one matrix"
    logger.info(msg)

    # add the per-group matrices together
    sum_profiles = functools.reduce(lambda x, y: x + y, sum_profiles)
    # the reversed rows are written into the lil copy; the summed matrix
    # itself is only read from here on
    sum_profiles_lil = sum_profiles.tolil()

    msg = "Flipping the reverse strand profiles"
    logger.info(msg)

    m_reverse = orfs['strand'] == '-'
    reverse_orfs = orfs[m_reverse]

    reverse_rows = tqdm.tqdm(reverse_orfs.iterrows(), total=len(reverse_orfs))
    for _, reverse_orf in reverse_rows:
        orf_num = reverse_orf['orf_num']

        # an all-zero profile is unchanged by reversal
        if sum_profiles[orf_num].sum() == 0:
            continue

        orf_len = reverse_orf['orf_len']
        dense = utils.to_dense(sum_profiles, orf_num, length=orf_len)
        dense = dense[::-1]
        sum_profiles_lil[orf_num, :orf_len] = dense

    msg = "Writing the sparse matrix to disk"
    logger.info(msg)
    math_utils.write_sparse_matrix(args.out, sum_profiles_lil)
Beispiel #9
0
def main():
    """Command-line entry point: run bowtie2 on each given fastq file.

    For every fastq file a ``call-program bowtie2 ...`` command is built and
    handed to ``slurm.check_sbatch``. Alignments, unaligned reads and
    aligned reads are only written to the output directory when the
    respective flag is given; otherwise the alignments go to the path
    returned by ``utils.abspath("dev", "null")``.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script runs bowtie2 on all of the provided input files "
        "using the given index. By default, it does not save the alignments, "
        "aligned reads or unaligned reads. The respective flags must be given "
        "to retain the desired entities.")

    parser.add_argument('index', help="The bowtie2 index")
    parser.add_argument('out', help="The output directory")
    parser.add_argument('fastq', help="The fastq files", nargs='+')

    parser.add_argument('-a', '--alignments', help="If this flag is present, "
        "the alignments will be present in the output folder", action='store_true')
    parser.add_argument('--un-gz', help="If this flag is present, then the "
        "unaligned reads will be present in the output folder", action='store_true')
    parser.add_argument('--al-gz', help="If this flag is present, then the "
        "aligned reads will be present in the output folder", action='store_true')

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    programs = ['bowtie2', 'call-program']
    shell_utils.check_programs_exist(programs)

    # only touch the file system when commands will actually be called
    if not os.path.exists(args.out) and not args.do_not_call:
        msg = "Creating output directory: {}".format(args.out)
        logger.info(msg)
        # exist_ok guards against the directory appearing after the check
        os.makedirs(args.out, exist_ok=True)

    for fastq in args.fastq:

        basename = utils.get_basename(fastq)

        if args.alignments:
            # NOTE(review): the file is named ".bam" but is passed to the
            # bowtie2 "-S" option; confirm the intended output format
            out = os.path.join(args.out, "{}.bam".format(basename))
        else:
            # we do not care about the alignments
            out = utils.abspath("dev", "null")
        out_str = "-S {}".format(out)

        un_gz_str = ""
        if args.un_gz:
            un_gz = os.path.join(args.out, "{}.un-al.fastq.gz".format(basename))
            un_gz_str = "--un-gz {}".format(un_gz)

        al_gz_str = ""
        if args.al_gz:
            al_gz = os.path.join(args.out, "{}.al.fastq.gz".format(basename))
            al_gz_str = "--al-gz {}".format(al_gz)

        cmd = "call-program bowtie2 -p {} --very-fast -x {} -U {} {} {} {}".format(
            args.num_cpus, args.index, fastq, out_str, un_gz_str, al_gz_str)

        slurm.check_sbatch(cmd, args=args)
Beispiel #10
0
def main():
    """Command-line entry point: create the reference files used by rpbp.

    Steps, in order:

    1. Parse arguments, load the yaml config, and check that the required
       programs, config keys and input files are present.
    2. If ``args.use_slurm`` is set, resubmit the command line through
       ``slurm.check_sbatch`` and return.
    3. Build the ribosomal bowtie2 index and the STAR genome index (each
       via ``shell_utils.call_if_not_exists``).
    4. Extract the annotated ORFs with ``get_orfs``.
    5. If the config has a ``de_novo_gtf`` entry, extract the de novo ORFs
       too and concatenate the annotated and de novo ORF and exon files;
       otherwise symlink the annotated files to the final file names.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script creates all of the files necessary for downstream "
        "analysis performed with the rpbp package.")
    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files "
                        "will be overwritten.",
                        action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # NOTE: safe_load replaces a bare yaml.load(stream), which needs an
    # explicit Loader on current PyYAML and can construct arbitrary objects;
    # the context manager closes the config file once it has been read
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'extract-orf-coordinates', 'label-orfs', 'bowtie2-build-s',
        'split-bed12-blocks', 'gtf-to-bed12', args.star_executable
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'genome_base_path', 'genome_name', 'gtf', 'fasta', 'ribosomal_fasta',
        'ribosomal_index', 'star_index'
    ]
    utils.check_keys_exist(config, required_keys)

    # check that the required files are present
    files = [config['gtf'], config['fasta'], config['ribosomal_fasta']]

    if 'de_novo_gtf' in config:
        files += [config['de_novo_gtf']]

    utils.check_files_exist(files, source='prepare-rpbp-genome')

    # now, check if we want to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # the rrna index
    cmd = "bowtie2-build-s {} {}".format(config['ribosomal_fasta'],
                                         config['ribosomal_index'])

    in_files = [config['ribosomal_fasta']]
    out_files = bio.get_bowtie2_index_files(config['ribosomal_index'])
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)

    # the STAR index
    mem = utils.human2bytes(args.mem)
    cmd = ("{} --runMode genomeGenerate --genomeDir {} --genomeFastaFiles {} "
           "--runThreadN {} --limitGenomeGenerateRAM {}".format(
               args.star_executable, config['star_index'], config['fasta'],
               args.num_cpus, mem))

    in_files = [config['fasta']]
    out_files = star_utils.get_star_index_files(config['star_index'])
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)

    # get the main orfs
    get_orfs(config['gtf'], args, config, is_annotated=True, is_de_novo=False)

    # eventually, we will use these names
    annotated_orfs = filenames.get_orfs(config['genome_base_path'],
                                        config['genome_name'],
                                        note=config.get('orf_note'),
                                        is_annotated=True,
                                        is_de_novo=False)

    annotated_exons_file = filenames.get_exons(config['genome_base_path'],
                                               config['genome_name'],
                                               note=config.get('orf_note'),
                                               is_annotated=True,
                                               is_de_novo=False)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    # now, check if we have a de novo assembly
    if 'de_novo_gtf' in config:
        get_orfs(config['de_novo_gtf'],
                 args,
                 config,
                 is_annotated=False,
                 is_de_novo=True)

        # we need to concat the ORF and exon files
        de_novo_orfs = filenames.get_orfs(config['genome_base_path'],
                                          config['genome_name'],
                                          note=config.get('orf_note'),
                                          is_annotated=False,
                                          is_de_novo=True)

        de_novo_exons_file = filenames.get_exons(config['genome_base_path'],
                                                 config['genome_name'],
                                                 note=config.get('orf_note'),
                                                 is_annotated=False,
                                                 is_de_novo=True)

        orfs_files = [annotated_orfs, de_novo_orfs]

        orfs_files_str = ' '.join(orfs_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            orfs_genomic, orfs_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(orfs_files, sort_bed=True)
            # renumber the ORFs so orf_num is unique across both sources
            concatenated_bed['orf_num'] = range(len(concatenated_bed))
            fields = bed_utils.bed12_field_names + [
                'orf_num', 'orf_len', 'orf_type'
            ]
            bed_utils.write_bed(concatenated_bed[fields], orfs_genomic)
        else:
            msg = "Skipping concatenation due to --do-not-call value"
            logger.info(msg)

        exons_files = [annotated_exons_file, de_novo_exons_file]

        exons_files_str = ' '.join(exons_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            exons_file, exons_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(exons_files,
                                                     sort_bed=True)
            fields = bed_utils.bed6_field_names + [
                'exon_index', 'transcript_start'
            ]
            bed_utils.write_bed(concatenated_bed[fields], exons_file)
        else:
            msg = "Skipping concatenation due to --do-not-call value"
            logger.info(msg)

    else:
        # finally, make sure our files are named correctly

        if os.path.exists(annotated_orfs):
            utils.create_symlink(annotated_orfs, orfs_genomic, call)

        if os.path.exists(annotated_exons_file):
            utils.create_symlink(annotated_exons_file, exons_file, call)
def main():
    """Command-line entry point: find ORF peptide matches for a project.

    Loads the yaml config and, for every cell type listed under
    ``peptide_cell_type_analysis`` and every peptide file listed for it,
    builds a ``get-orf-peptide-matches`` command and hands it to
    ``slurm.check_sbatch``. Cell types or peptide files that are not
    specified in the config, or whose input files do not exist, are
    skipped with a warning.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script identifies the orf peptide matches for all samples in "
        "a project.")
    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--peptide-filter-field', help="The field to use for "
        "filtering the peptides from MaxQuant", default=default_peptide_filter_field)

    parser.add_argument('--peptide-filter-value', help="All peptides with a value "
        "greater than the filter value will be removed", type=float, 
        default=default_peptide_filter_value)

    parser.add_argument('--peptide-separator', help="The separator in the "
        "peptide file", default=default_peptide_separator)

    parser.add_argument('--note', help="If this option is given, it will be used in "
        "the output filenames.\n\nN.B. This REPLACES the note in the config file.", 
        default=default_note)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)

    # NOTE: safe_load replaces a bare yaml.load(stream), which needs an
    # explicit Loader on current PyYAML and can construct arbitrary objects;
    # the context manager closes the config file once it has been read
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)
    # NOTE(review): `call` is not read again in this function
    call = not args.do_not_call

    programs = [
        'get-orf-peptide-matches'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'peptide_files',
        'peptide_cell_type_analysis',
        'riboseq_data',
        'riboseq_samples'
    ]
    utils.check_keys_exist(config, required_keys)

    # the config note names the input files; --note, when given, replaces
    # it for the output files only
    note_str = config.get('note', None)
    out_note_str = note_str

    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note

    args_dict = vars(args)

    # the keys must match the argparse destinations ("peptide_...", not
    # "peptides_...") so the parsed values are the ones looked up
    peptide_filter_field_str = utils.get_config_argument(args_dict, 'peptide_filter_field')
    peptide_filter_value_str = utils.get_config_argument(args_dict, 'peptide_filter_value')
    peptide_separator_str = utils.get_config_argument(args_dict, 'peptide_separator')

    num_cpus_str = utils.get_config_argument(args_dict, 'num_cpus')

    cell_types = ribo_utils.get_riboseq_cell_type_samples(config)
    for cell_type, peptide_files in config['peptide_cell_type_analysis'].items():
        if cell_type not in cell_types:
            msg = ("Could not find cell_type specification. Please check the config "
                "file: {}".format(cell_type))
            logger.warning(msg)
            continue

        cell_type_protein = ribo_filenames.get_riboseq_cell_type_protein(
            config['riboseq_data'], cell_type, is_filtered=True, note=note_str)

        if not os.path.exists(cell_type_protein):
            msg = ("Could not find cell_type protein fasta. Skipping: {}".
                format(cell_type_protein))
            logger.warning(msg)
            continue

        for peptide_file in peptide_files:
            if peptide_file not in config['peptide_files']:
                msg = ("Could not find peptide_file specification. Please check "
                    "the config file: {}".format(peptide_file))
                logger.warning(msg)
                continue

            peptide_txt_file = config['peptide_files'][peptide_file]

            if not os.path.exists(peptide_txt_file):
                msg = ("Could not find peptide.txt file. Skipping: {}".
                    format(peptide_txt_file))
                logger.warning(msg)
                continue

            peptide_matches = ribo_filenames.get_riboseq_peptide_matches(
                config['riboseq_data'], cell_type, peptide_file, 
                is_filtered=True, note=out_note_str)

            cmd = "get-orf-peptide-matches {} {} {} {} {} {} {} {}".format(cell_type_protein, 
                peptide_txt_file, peptide_matches, num_cpus_str, peptide_filter_field_str, 
                peptide_filter_value_str, peptide_separator_str, logging_str)

            slurm.check_sbatch(cmd, args=args)
def main():
    """Submit every riboseq sample in the config file to ``run-rpbp-pipeline``.

    One ``run-rpbp-pipeline`` command is built for each entry of
    ``config['riboseq_samples']`` and passed to ``slurm.check_sbatch``. When
    ``--merge-replicates`` is given, one ``predict-translated-orfs`` command
    per condition is submitted afterwards, with the collected per-sample job
    ids passed as dependencies.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This is a helper script which submits a set of samples to SLURM. It "
        "can also be used to run a set of samples sequentially. Due to limitations on "
        "the config file specification, all of the samples must use the same reference "
        "indices (i.e., genome sequence, set of ORFs, etc.).")

    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--tmp', help="The temp directory", default=default_tmp)

    # adjacent string literals are concatenated verbatim, so each fragment
    # must carry its own trailing space
    parser.add_argument('--flexbar-options', help="A space-delimited list of options to "
        "pass to flexbar. Each option must be quoted separately as in \"--flexbarOption value\". "
        "If specified, flexbar options will override default settings.", nargs='*', type=str)

    parser.add_argument('--overwrite', help="If this flag is present, existing files "
        "will be overwritten.", action='store_true')

    parser.add_argument('--profiles-only', help="If this flag is present, then only "
        "the pre-processing part of the pipeline will be called, i.e. profiles "
        "will be created for each sample specified in the config file, but no predictions "
        "will be made.", action='store_true')

    parser.add_argument('--merge-replicates', help="If this flag is present, then "
        "the ORF profiles from the replicates will be merged before making the final "
        "predictions", action='store_true')

    parser.add_argument('--run-replicates', help="If this flag is given with the "
        "--merge-replicates flag, then both the replicates *and* the individual "
        "samples will be run. This flag has no effect if --merge-replicates is not "
        "given.", action='store_true')

    parser.add_argument('-k', '--keep-intermediate-files', help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.", action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    # NOTE(review): safe_load replaces the bare yaml.load(...) call, which
    # requires an explicit Loader in recent PyYAML releases. This assumes the
    # config only contains plain YAML (no python-specific tags) -- confirm.
    # The context manager makes sure the file handle is closed.
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'create-base-genome-profile',
        'remove-multimapping-reads',
        'extract-metagene-profiles',
        'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets',
        'extract-orf-profiles',
        'estimate-orf-bayes-factors',
        'select-final-prediction-set',
        'create-orf-profiles',
        'predict-translated-orfs',
        'run-rpbp-pipeline'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'riboseq_samples',
        'ribosomal_index',
        'star_index',
        'genome_base_path',
        'genome_name',
        'fasta',
        'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)

    # handle do_not_call so that we _do_ call the pipeline script, but that it does not run anything
    do_not_call_str = ""
    if not call:
        do_not_call_str = "--do-not-call"
    args.do_not_call = False

    overwrite_str = "--overwrite" if args.overwrite else ""

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    # check if we only want to create the profiles, in this case
    # we call run-rpbp-pipeline with the --profiles-only option
    profiles_only_str = ""
    if args.profiles_only:
        args.merge_replicates = False
        profiles_only_str = "--profiles-only"
        msg = ("The --profiles-only option was given, this will override --merge-replicates "
               "and/or --run-replicates, if these options were also given!")
        logger.info(msg)

    # if we merge the replicates, then we only use the rpbp script to create
    # the ORF profiles, but we still make predictions
    if args.merge_replicates and not args.run_replicates:
        profiles_only_str = "--profiles-only"

    if args.run_replicates and not args.merge_replicates:
        msg = ("The --run-replicates option was given without the --merge-replicates "
            "option. It will be ignored.")
        logger.warning(msg)

    flexbar_option_str = ""
    if args.flexbar_options is not None:
        # every option is wrapped in double quotes so it is forwarded as one token
        flexbar_option_str = "--flexbar-options {}".format(' '.join('"' + flx_op + '"'
            for flx_op in args.flexbar_options))

    # collect the job_ids in case we are using slurm and need to merge replicates
    job_ids = []

    sample_names = sorted(config['riboseq_samples'].keys())

    for sample_name in sample_names:
        data = config['riboseq_samples'][sample_name]

        # each sample gets its own sub-directory below the requested temp dir
        tmp_str = ""
        if args.tmp is not None:
            tmp = os.path.join(args.tmp, "{}_{}_rpbp".format(sample_name, note))
            tmp_str = "--tmp {}".format(tmp)

        cmd = "run-rpbp-pipeline {} {} {} --num-cpus {} {} {} {} {} {} {} {} {} {}".format(
            data,
            args.config,
            sample_name,
            args.num_cpus,
            tmp_str,
            do_not_call_str,
            overwrite_str,
            logging_str,
            star_str,
            profiles_only_str,
            flexbar_option_str,
            keep_intermediate_str,
            mem_str
        )

        job_id = slurm.check_sbatch(cmd, args=args)

        job_ids.append(job_id)

    # now, if we are running the "standard" pipeline, we are finished
    if not args.merge_replicates:
        return

    # otherwise, we need to merge the replicates for each condition
    riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
    merge_replicates_str = "--merge-replicates"

    for condition_name in sorted(riboseq_replicates.keys()):

        # then we predict the ORFs
        cmd = "predict-translated-orfs {} {} --num-cpus {} {} {} {} {}".format(
            args.config,
            condition_name,
            args.num_cpus,
            do_not_call_str,
            overwrite_str,
            logging_str,
            merge_replicates_str
        )

        # the merged predictions depend on all of the per-sample jobs
        slurm.check_sbatch(cmd, args=args, dependencies=job_ids)
def main():
    """Download the SRR runs listed in a csv file and convert them to fastq.gz.

    The script parses the command line, checks that ``fastq-dump`` is
    available, and then either resubmits itself through
    ``slurm.check_sbatch`` (``--use-slurm``) or reads the SRR table with
    pandas and hands it to ``process_files`` via
    ``parallel.apply_parallel_split``.
    """
    import shlex

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script downloads short read archive runs (i.e., SRR) files "
        "over ftp. It only requires the run number. It also converts the files from "
        "the .sra format to .fastq.gz files. It then deletes the .sra file.")

    parser.add_argument(
        'srr',
        help="A csv file containing the SRR accessions to "
        "download. Optionally, it can also include whether the samples are paired-"
        "end or not.")
    parser.add_argument('outdir', help="The location for the fastq.gz files")

    parser.add_argument('-a',
                        '--accession-field',
                        help="The name of the column "
                        "containing the SRR identifiers",
                        default=default_accession_field)

    parser.add_argument('-p',
                        '--paired-field',
                        help="The name of the column "
                        "indicating whether the sample is paired-end",
                        default=default_paired_field)

    parser.add_argument(
        '-v',
        '--paired-values',
        help="The exact string values in "
        "the paired-field which indicate the sample is paired-end",
        nargs="*",
        default=default_paired_values)

    parser.add_argument('-s',
                        '--source',
                        help="The server from which the files "
                        "will be downloaded",
                        choices=source_choices,
                        default=default_source)

    parser.add_argument(
        '--overwrite',
        help="If this flag is given, then existing "
        "files will be re-downloaded. Otherwise, if either the .sra or .fastq.gz "
        "file already exists, then the sra file will not be downloaded.",
        action='store_true')

    parser.add_argument(
        '--num-downloads-per-connection',
        help="The number of "
        "files to download with each open connection. Each connection will be "
        "closed and re-opened after this many files are downloaded.",
        type=int,
        default=default_num_downloads_per_connection)

    parser.add_argument('--sep',
                        help="The separator in the SRR file",
                        default=default_sep)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    programs = ['fastq-dump']
    shell_utils.check_programs_exist(programs)

    # check if we want to use slurm
    if args.use_slurm:
        msg = ("The --use-slurm option was given, so sbatch will now be used "
               "to submit to slurm.")
        logger.warning(msg)

        # Rebuild the original command line. Each argument is quoted so that
        # values containing whitespace or shell metacharacters (e.g. a
        # separator given with --sep) are not re-split if the command string
        # is later parsed by a shell.
        cmd = ' '.join(shlex.quote(arg) for arg in sys.argv)

        slurm.check_sbatch(cmd, args=args)

        # and quit!
        return

    msg = "Reading SRR list"
    logger.info(msg)

    srr = pd.read_csv(args.srr, sep=args.sep)

    parallel.apply_parallel_split(srr, args.num_cpus, process_files, args)
def main():
    """Create the LaTeX preprocessing report for a riboseq project.

    The report is written to ``<out>/preprocessing-report.tex`` and compiled
    with ``latex.compile``. It contains the read filtering figures, the read
    length distributions, the per-sample metagene profile tables and,
    optionally, the ORF type periodicity figures. If
    ``--create-fastqc-reports`` is given, ``create_fastqc_reports`` is run
    for every sample afterwards.
    """
    import shlex

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script creates a simple latex document containing the read "
        "filtering images, metagene profiles and analysis, and standard section text."
    )
    parser.add_argument('config',
                        help="The (yaml) config file for the project")
    parser.add_argument('out', help="The path for the output files")

    parser.add_argument(
        '--show-orf-periodicity',
        help="If this flag is "
        "present, bar charts showing the periodicity of each ORF type will be "
        "included in the report.",
        action='store_true')

    parser.add_argument(
        '--show-read-length-bfs',
        help="If this flag is given, "
        "plots showing the Bayes factor at each offset for each read length "
        "are included in the report.",
        action='store_true')

    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files will "
                        "be overwritten.",
                        action='store_true')

    parser.add_argument('--min-visualization-count',
                        help="Read lengths with fewer than this "
                        "number of reads will not be included in the report.",
                        type=int,
                        default=default_min_visualization_count)

    parser.add_argument('--image-type',
                        help="The type of image types to create. This "
                        "must be an extension which matplotlib can interpret.",
                        default=default_image_type)

    parser.add_argument(
        '-c',
        '--create-fastqc-reports',
        help="If this flag is given, then "
        "fastqc reports will be created for most fastq and bam files. By default, they are "
        "not created.",
        action='store_true')

    parser.add_argument('--tmp',
                        help="If the fastqc reports are created, "
                        "they will use this location for temp files",
                        default=default_tmp)

    parser.add_argument(
        '--note',
        help="If this option is given, it will be used in the "
        "filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=default_note)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # The config is read exactly once and the handle is closed right away.
    # NOTE(review): safe_load replaces the bare yaml.load(...) call, which
    # requires an explicit Loader in recent PyYAML releases. This assumes the
    # config only contains plain YAML (no python-specific tags) -- confirm.
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)

    # The identity check against the default acts as a "was --note given?"
    # sentinel; a given note replaces the one from the config file.
    if args.note is not default_note:
        config['note'] = args.note

    # keep multimappers?
    is_unique = 'keep_riboseq_multimappers' not in config

    programs = [
        'create-read-length-metagene-profile-plot',
        'visualize-metagene-profile-bayes-factor',
        'get-all-read-filtering-counts', 'samtools',
        'visualize-read-filtering-counts', 'get-read-length-distribution',
        'plot-read-length-distribution'
    ]

    if args.create_fastqc_reports:
        programs.extend(['fastqc', 'java'])

    shell_utils.check_programs_exist(programs)

    if args.use_slurm:
        # Rebuild the original command line. Each argument is quoted so that
        # values containing whitespace or shell metacharacters are not
        # re-split if the command string is later parsed by a shell.
        cmd = ' '.join(shlex.quote(arg) for arg in sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    note = config.get('note', None)

    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)

    # first, create the read filtering information
    create_read_filtering_plots(args.config, config, args)

    # thresholds used below to label the status of each read length
    min_metagene_profile_count = config.get(
        "min_metagene_profile_count", default_min_metagene_profile_count)

    min_metagene_profile_bayes_factor_mean = config.get(
        "min_metagene_profile_bayes_factor_mean",
        default_min_metagene_profile_bayes_factor_mean)

    max_metagene_profile_bayes_factor_var = config.get(
        "max_metagene_profile_bayes_factor_var",
        default_max_metagene_profile_bayes_factor_var)

    project_name = config.get("project_name", default_project_name)
    title = "Preprocessing results for {}".format(project_name)

    sample_names = sorted(config['riboseq_samples'].keys())

    tex_file = os.path.join(args.out, "preprocessing-report.tex")
    with open(tex_file, 'w') as out:

        latex.begin_document(out, title, abstract, commands=commands)

        latex.section(out, "Introduction")

        latex.clearpage(out)
        latex.newpage(out)

        latex.section(out, "Mapping and filtering")
        latex.write(out, mapping_and_filtering_text)

        # the read filtering figures

        read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=note, image_type=args.image_type)

        no_rrna_note = "no-rrna-{}".format(note)
        no_rrna_read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'],
            note=no_rrna_note,
            image_type=args.image_type)

        latex.begin_figure(out)
        latex.write_graphics(out, read_filtering_image, height=0.45)
        latex.write_graphics(out, no_rrna_read_filtering_image, height=0.45)
        latex.write_caption(out,
                            read_filtering_caption,
                            label=read_filtering_label)
        latex.end_figure(out)

        latex.clearpage(out)

        # the read length distributions
        latex.section(out,
                      "Read length distributions",
                      label=length_distribution_section_label)

        msg = "Writing length distribution figures"
        logger.info(msg)

        latex.begin_table(out, "cc")

        latex.write_header(out,
                           ["All aligned reads", "Uniquely-aligning reads"])

        # one table row per sample: all reads on the left, unique on the right
        for name in sample_names:
            read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'],
                name,
                is_unique=False,
                note=note,
                image_type=args.image_type)

            unique_read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'],
                name,
                is_unique=True,
                note=note,
                image_type=args.image_type)

            msg = "Looking for image file: {}".format(
                read_length_distribution_image)
            logger.debug(msg)

            if os.path.exists(read_length_distribution_image):
                latex.write_graphics(out,
                                     read_length_distribution_image,
                                     width=0.45)
            else:
                msg = "Could not find image: {}".format(
                    read_length_distribution_image)
                logger.warning(msg)

                text = "Missing: {}\n\n".format(name)
                latex.write(out, text)

            latex.write_column_sep(out)

            msg = "Looking for image file: {}".format(
                unique_read_length_distribution_image)
            logger.debug(msg)

            if os.path.exists(unique_read_length_distribution_image):
                latex.write_graphics(out,
                                     unique_read_length_distribution_image,
                                     width=0.45)
            else:
                msg = "Could not find image: {}".format(
                    unique_read_length_distribution_image)
                logger.warning(msg)

                text = "Missing: {}\n\n".format(name)
                latex.write(out, text)

            latex.write_row_sep(out)

        latex.end_table(out)
        latex.clearpage(out)

        latex.section(out, "Read length periodicity", label=periodicity_label)

        for name in sample_names:
            # number of cells written so far in this sample's two-column table
            i = 0

            msg = "Processing sample: {}".format(name)
            logger.info(msg)

            logger.debug("overwrite: {}".format(args.overwrite))

            periodic_offsets = filenames.get_periodic_offsets(
                config['riboseq_data'], name, is_unique=is_unique, note=note)
            offsets_df = pd.read_csv(periodic_offsets)

            min_read_length = int(offsets_df['length'].min())
            max_read_length = int(offsets_df['length'].max())

            create_figures(args.config, config, name, offsets_df, args)

            latex.begin_table(out, "YY")

            header = "\\multicolumn{2}{c}{" + name + "}"
            header = [header]
            latex.write_header(out, header)

            for length in range(min_read_length, max_read_length + 1):
                msg = "Processing length: {}".format(length)
                logger.info(msg)

                # check which offset is used

                # select the row for this length
                mask_length = offsets_df['length'] == length

                # TODO: this is sometimes length 0. why?
                if not mask_length.any():
                    continue

                length_row = offsets_df[mask_length].iloc[0]

                # now, check all of the filters; a later failing filter
                # overrides the status set by an earlier one
                offset = int(length_row['highest_peak_offset'])
                offset_status = "Used for analysis"

                if length_row[
                        'highest_peak_bf_mean'] < min_metagene_profile_bayes_factor_mean:
                    offset_status = "BF mean too small"

                if length_row[
                        'highest_peak_bf_var'] > max_metagene_profile_bayes_factor_var:
                    offset_status = "BF variance too high"

                if length_row[
                        'highest_peak_profile_sum'] < min_metagene_profile_count:
                    offset_status = "Count too small"

                if length_row[
                        'highest_peak_profile_sum'] < args.min_visualization_count:
                    msg = "Not enough reads of this length. Skipping."
                    logger.warning(msg)
                    continue

                metagene_profile_image = filenames.get_metagene_profile_image(
                    config['riboseq_data'],
                    name,
                    image_type=args.image_type,
                    is_unique=is_unique,
                    length=length,
                    note=note)

                cell_caption = (
                    "length: {}. P-site offset: {}. \\newline status: {}"
                    "\n".format(length, offset, offset_status))
                latex.write(out, cell_caption, size="scriptsize")

                latex.write_graphics(out, metagene_profile_image, width=0.45)

                # odd cell count: move to the right column; even: end the row
                i += 1
                if i % 2 == 1:
                    latex.write_column_sep(out)
                else:
                    latex.write_row_sep(out)

                if args.show_read_length_bfs:

                    bayes_factor_image = filenames.get_metagene_profile_bayes_factor_image(
                        config['riboseq_data'],
                        name,
                        image_type=args.image_type,
                        is_unique=is_unique,
                        length=length,
                        note=note)

                    latex.centering(out)
                    latex.write_graphics(out, bayes_factor_image, width=0.45)

                    i += 1
                    if i % 2 == 1:
                        latex.write_column_sep(out)
                    else:
                        latex.write_row_sep(out)

            # close a row that was left with only its left cell filled
            if i % 2 == 1:
                latex.write_row_sep(out)

            latex.end_table(out)
            latex.clearpage(out)

        ### ORF type metagene profiles
        if args.show_orf_periodicity:
            latex.section(out, "ORF type periodicity")

            strands = ['+', '-']
            for sample_name in sample_names:
                # number of images written for this sample; four per figure
                i = 0

                try:
                    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                        config, sample_name, is_unique=is_unique)
                except FileNotFoundError:
                    msg = (
                        "Could not parse out lengths and offsets for sample: {}. "
                        "Skipping".format(sample_name))
                    logger.error(msg)
                    continue

                orf_type_profile_base = filenames.get_orf_type_profile_base(
                    config['riboseq_data'],
                    sample_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=note,
                    subfolder='orf-profiles')

                for orf_type in ribo_utils.orf_types:
                    for strand in strands:
                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base,
                            orf_type,
                            strand,
                            image_type=args.image_type)

                        msg = "Looking for image file: {}".format(
                            orf_type_profile)
                        logger.debug(msg)
                        if os.path.exists(orf_type_profile):
                            if i % 4 == 0:
                                latex.begin_figure(out)

                            i += 1
                            latex.write_graphics(out,
                                                 orf_type_profile,
                                                 height=0.23)

                            if i % 4 == 0:
                                latex.end_figure(out)
                                latex.clearpage(out)

                # close a figure that holds fewer than four images
                if (i > 0) and (i % 4 != 0):
                    latex.end_figure(out)
                    latex.clearpage(out)

        latex.end_document(out)

    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)

    if args.create_fastqc_reports:
        parallel.apply_parallel_iter(config['riboseq_samples'].items(),
                                     args.num_cpus, create_fastqc_reports,
                                     config, args)
Beispiel #15
0
def main():
    """Extract all ORFs from the given transcripts and write them as bed12+1.

    The start and stop codons are compiled into regular expressions, every
    transcript from the fasta file is paired with its bed entry through
    ``get_transcript``, and ``get_orfs`` is applied to the pairs via
    ``parallel.apply_parallel_iter``. The resulting data frames are
    concatenated, de-duplicated on ``DUPLICATE_FIELDS``, numbered in the
    ``orf_num`` column and written with ``bed_utils.write_bed``.
    """
    import shlex

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script extracts all of the ORFs from the given transcripts. "
        "It writes the result as a bed12+1 file. The additional field, 'orf_len', gives "
        "the length of the respective ORF. It removes duplicate ORFs.\n\nN.B. The DEBUG "
        "output for this script is _very_ verbose. It is not recommended to run this "
        "script with that logging level.")

    parser.add_argument('transcripts_bed',
                        help="The bed12 file containing the "
                        "transcript information")

    parser.add_argument('transcripts_fasta',
                        help="The fasta file containing the "
                        "spliced transcript sequences")

    parser.add_argument('out', help="The output (bed12+1 gz) file")

    parser.add_argument('--start-codons',
                        help="A list of codons which will be "
                        "treated as start codons when extracting ORFs",
                        nargs='+',
                        default=default_start_codons)

    parser.add_argument('--stop-codons',
                        help="A list of codons which will be "
                        "treated as stop codons when extracting ORFs",
                        nargs='+',
                        default=default_stop_codons)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # check if we wanted to use slurm
    if args.use_slurm:
        # Rebuild the original command line. Each argument is quoted so that
        # values containing whitespace or shell metacharacters (e.g. a codon
        # pattern containing "|") are not re-interpreted if the command
        # string is later parsed by a shell.
        cmd = ' '.join(shlex.quote(arg) for arg in sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    msg = "Compiling start and stop codon regular expressions"
    logger.info(msg)

    # each codon list becomes one alternation pattern
    start_codons_re = re.compile('|'.join(args.start_codons))
    stop_codons_re = re.compile('|'.join(args.stop_codons))

    msg = "Reading transcripts bed file"
    logger.info(msg)
    transcripts_bed = bed_utils.read_bed(args.transcripts_bed)

    msg = "Creating the sequence iterator"
    logger.info(msg)

    transcripts_fasta = fastx_utils.get_read_iterator(args.transcripts_fasta)

    # lazily pair each fasta record with its bed entry
    transcripts_iter = ((get_transcript(transcript_header,
                                        transcripts_bed), transcript_sequence)
                        for (transcript_header,
                             transcript_sequence) in transcripts_fasta)

    msg = "Finding all ORFs"
    logger.info(msg)

    orfs = parallel.apply_parallel_iter(transcripts_iter,
                                        args.num_cpus,
                                        get_orfs,
                                        start_codons_re,
                                        stop_codons_re,
                                        total=len(transcripts_bed),
                                        progress_bar=True)

    msg = "Joining ORFs in a large data frame"
    logger.info(msg)

    orfs = pd.concat(orfs)

    msg = "Removing duplicate ORFs"
    logger.info(msg)

    orfs = orfs.drop_duplicates(subset=DUPLICATE_FIELDS)

    msg = "Numbering remaining ORFs"
    logger.info(msg)

    orfs['orf_num'] = np.arange(len(orfs))

    msg = "Writing ORFs to disk"
    logger.info(msg)
    bed_utils.write_bed(orfs, args.out)
def main():
    """Submit ``get-orf-peptide-matches`` for every cell type / peptide file pair.

    For each cell type listed under ``peptide_cell_type_analysis`` in the
    config, one ``get-orf-peptide-matches`` command per associated peptide
    file is passed to ``slurm.check_sbatch``. Entries for which the cell type
    specification, the cell type protein fasta, the peptide file
    specification or the peptide file itself cannot be found are skipped with
    a warning.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script identifies the orf peptide matches for all samples in "
        "a project.")
    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--peptide-filter-field',
                        help="The field to use for "
                        "filtering the peptides from MaxQuant",
                        default=default_peptide_filter_field)

    parser.add_argument('--peptide-filter-value',
                        help="All peptides with a value "
                        "greater than the filter value will be removed",
                        type=float,
                        default=default_peptide_filter_value)

    parser.add_argument('--peptide-separator',
                        help="The separator in the "
                        "peptide file",
                        default=default_peptide_separator)

    parser.add_argument(
        '--note',
        help="If this option is given, it will be used in "
        "the output filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=default_note)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)

    # NOTE(review): safe_load replaces the bare yaml.load(...) call, which
    # requires an explicit Loader in recent PyYAML releases. This assumes the
    # config only contains plain YAML (no python-specific tags) -- confirm.
    # The context manager makes sure the file handle is closed.
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)

    programs = ['get-orf-peptide-matches']
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'peptide_files', 'peptide_cell_type_analysis', 'riboseq_data',
        'riboseq_samples'
    ]
    utils.check_keys_exist(config, required_keys)

    # the config note locates the inputs; --note, if given, only renames the outputs
    note_str = config.get('note', None)
    out_note_str = note_str

    if args.note:
        out_note_str = args.note

    args_dict = vars(args)

    # The lookup keys must match the argparse destination names
    # (``--peptide-filter-field`` -> ``peptide_filter_field``); otherwise the
    # values given on the command line are never found in ``args_dict``.
    peptide_filter_field_str = utils.get_config_argument(
        args_dict, 'peptide_filter_field')
    peptide_filter_value_str = utils.get_config_argument(
        args_dict, 'peptide_filter_value')
    peptide_separator_str = utils.get_config_argument(args_dict,
                                                      'peptide_separator')

    num_cpus_str = utils.get_config_argument(args_dict, 'num_cpus')

    cell_types = ribo_utils.get_riboseq_cell_type_samples(config)
    for cell_type, peptide_files in config['peptide_cell_type_analysis'].items(
    ):
        if cell_type not in cell_types:
            msg = (
                "Could not find cell_type specification. Please check the config "
                "file: {}".format(cell_type))
            logger.warning(msg)
            continue

        cell_type_protein = ribo_filenames.get_riboseq_cell_type_protein(
            config['riboseq_data'], cell_type, is_filtered=True, note=note_str)

        if not os.path.exists(cell_type_protein):
            msg = ("Could not find cell_type protein fasta. Skipping: {}".
                   format(cell_type_protein))
            logger.warning(msg)
            continue

        for peptide_file in peptide_files:
            if peptide_file not in config['peptide_files']:
                msg = (
                    "Could not find peptide_file specification. Please check "
                    "the config file: {}".format(peptide_file))
                logger.warning(msg)
                continue

            peptide_txt_file = config['peptide_files'][peptide_file]

            if not os.path.exists(peptide_txt_file):
                msg = ("Could not find peptide.txt file. Skipping: {}".format(
                    peptide_txt_file))
                logger.warning(msg)
                continue

            peptide_matches = ribo_filenames.get_riboseq_peptide_matches(
                config['riboseq_data'],
                cell_type,
                peptide_file,
                is_filtered=True,
                note=out_note_str)

            cmd = "get-orf-peptide-matches {} {} {} {} {} {} {} {}".format(
                cell_type_protein, peptide_txt_file, peptide_matches,
                num_cpus_str, peptide_filter_field_str,
                peptide_filter_value_str, peptide_separator_str, logging_str)

            slurm.check_sbatch(cmd, args=args)
def main():
    """Command-line entry point: score ORFs with Bayes factors / chi-square.

    Parses the command line, optionally re-submits itself through SLURM,
    filters the regions (ORFs) by type, length and profile sum, loads the
    pickled translated/untranslated models, and then evaluates the remaining
    regions in parallel via ``get_all_bayes_factors_args``. The concatenated
    results are written to ``args.out`` as a BED12+ file.

    The parsed arguments, the models and the CSR components of the profile
    matrix are stored in module-level globals.
    """
    global profiles_data, profiles_indices, profiles_indptr, profiles_shape
    global translated_models, untranslated_models
    global args

    # NOTE: "\\chi" is spelled with an escaped backslash; the resulting text
    # is unchanged, but the invalid "\c" escape sequence no longer triggers
    # a warning on recent Python versions.
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""
            This script uses Hamiltonian MCMC with Stan to estimate translation parameters
            for a set of regions (presumably ORFs). Roughly, it takes as input:

            (1) a set of regions (ORFs) and their corresponding profiles
            (2) a "translated" model which gives the probability that a region is translated
            (3) an "untranslated" model which gives the probability that a region is not translated

            The script first smoothes the profiles using LOWESS. It then calculates
            both the Bayes' factor (using the smoothed profile) and \\chi^2 value
            (using the raw counts) for each ORF.
        """
        )

    parser.add_argument('profiles', help="The ORF profiles (counts) (mtx)")
    parser.add_argument('regions', help="The regions (ORFs) for which predictions will "
        "be made (BED12+)")

    parser.add_argument('out', help="The output file for the Bayes' factors (BED12+)")

    parser.add_argument('--chi-square-only', help="If this flag is present, then only the chi "
        "square test will be performed for each ORF. This can also be a way to get the counts "
        "within each of the ORFs.", action='store_true')

    parser.add_argument('--translated-models', help="The models to use as H_t (pkl)", nargs='+')
    parser.add_argument('--untranslated-models', help="The models to use as H_u (pkl)", nargs='+')

    ### filtering options
    parser.add_argument('--orf-types', help="If values are given, then only orfs with "
        "those types are processed.", nargs='*', default=default_orf_types)
    parser.add_argument('--orf-type-field', default=default_orf_type_field)

    parser.add_argument('--min-length', help="ORFs with length less than this value will not "
        "be processed", type=int, default=default_min_length)
    parser.add_argument('--max-length', help="ORFs with length greater than this value will not "
        "be processed", type=int, default=default_max_length)
    parser.add_argument('--min-profile', help="ORFs with profile sum (i.e., number "
        "of reads) less than this value will not be processed.", type=float,
        default=default_min_profile)

    ### smoothing options
    parser.add_argument('--fraction', help="The fraction of signal to use in LOWESS",
        type=float, default=default_fraction)

    parser.add_argument('--reweighting-iterations', help="The number of reweighting "
        "iterations to use in LOWESS. Please see the statsmodels documentation for a "
        "detailed description of this parameter.", type=int, default=default_reweighting_iterations)

    ### MCMC options
    parser.add_argument('-s', '--seed', help="The random seeds to use for inference",
        type=int, default=default_seed)
    parser.add_argument('-c', '--chains', help="The number of MCMC chains to use", type=int,
        default=default_chains)
    parser.add_argument('-i', '--iterations', help="The number of MCMC iterations to use for "
        "each chain", type=int, default=default_iterations)

    ### behavior options
    parser.add_argument('--num-orfs', help="If n>0, then only this many ORFs will be processed",
        type=int, default=default_num_orfs)
    parser.add_argument('--orf-num-field', default=default_orf_num_field)

    parser.add_argument('--do-not-compress', help="Unless otherwise specified, the output will "
        "be written in GZip format", action='store_true')

    parser.add_argument('-g', '--num-groups', help="The number of groups into which to split "
        "the ORFs. More groups means the progress bar is updated more frequently but incurs "
        "more overhead because of the parallel calls.", type=int, default=default_num_groups)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # when SLURM is requested, re-submit this exact invocation and stop here
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # read in the regions and apply the filters
    msg = "Reading and filtering ORFs"
    logger.info(msg)
    regions = bed_utils.read_bed(args.regions)

    # by default, keep everything
    m_filters = np.ones(len(regions), dtype=bool)

    # orf type (only when at least one type was requested)
    if args.orf_types:
        m_orf_type = regions[args.orf_type_field].isin(args.orf_types)
        m_filters = m_orf_type & m_filters

    # min length
    if args.min_length > 0:
        m_min_length = regions['orf_len'] >= args.min_length
        m_filters = m_min_length & m_filters

    # max length
    if args.max_length > 0:
        m_max_length = regions['orf_len'] <= args.max_length
        m_filters = m_max_length & m_filters

    # min profile: keep the ORFs whose row in the profile matrix sums to at
    # least the threshold; the row index is matched against "orf_num"
    profiles = scipy.io.mmread(args.profiles).tocsr()
    profiles_sums = profiles.sum(axis=1)
    good_orf_nums = np.where(profiles_sums >= args.min_profile)
    good_orf_nums = set(good_orf_nums[0])
    m_profile = regions['orf_num'].isin(good_orf_nums)
    m_filters = m_profile & m_filters

    regions = regions[m_filters]

    if args.num_orfs > 0:
        regions = regions.head(args.num_orfs)

    regions = regions.reset_index(drop=True)

    msg = "Number of regions after filtering: {}".format(len(regions))
    logger.info(msg)

    def _load_pickle(path):
        """Load one pickled model, closing the file handle afterwards."""
        # NOTE(review): pickle can execute arbitrary code while loading;
        # only model files from a trusted source should be passed here.
        with open(path, 'rb') as model_file:
            return pickle.load(model_file)

    logger.debug("Reading models")
    translated_models = [_load_pickle(tm) for tm in args.translated_models]
    untranslated_models = [_load_pickle(bm) for bm in args.untranslated_models]

    # keep the CSR components of the profile matrix in raw ctypes arrays held
    # by module-level globals (presumably so that the parallel workers can
    # rebuild the matrix without copying it -- confirm against the worker)
    profiles_data = multiprocessing.RawArray(ctypes.c_double, profiles.data.flat)
    profiles_indices = multiprocessing.RawArray(ctypes.c_int, profiles.indices)
    profiles_indptr = multiprocessing.RawArray(ctypes.c_int, profiles.indptr)
    profiles_shape = multiprocessing.RawArray(ctypes.c_int, profiles.shape)

    with suppress_stdout_stderr():

        bfs_l = parallel.apply_parallel_split(
            regions,
            args.num_cpus,
            get_all_bayes_factors_args,
            num_groups=args.num_groups,
            progress_bar=True
        )

    bfs = pd.concat(bfs_l)

    # write the results as a bed12+ file
    bed_utils.write_bed(bfs, args.out)
def main():
    """Command-line entry point: run the Rp-Bp / Rp-chi pipeline on one sample.

    Parses the command line, verifies that the required programs and config
    keys are available, optionally re-submits itself through SLURM, and then
    calls ``create-orf-profiles`` followed (unless ``--profiles-only`` is
    given) by ``predict-translated-orfs``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script runs the Rp-Bp and Rp-chi pipelines on a given sample. "
        "It requires a YAML config file that includes a number of keys. Please see the "
        "documentation for a complete description.")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument(
        'name', help="The name for the dataset, used in the created files")

    parser.add_argument('--tmp',
                        help="The temp directory",
                        default=default_tmp)

    parser.add_argument(
        '--flexbar-format-option',
        help="The name of the \"format\" "
        "option for flexbar. This changed from \"format\" to \"qtrim-format\" in "
        "version 2.7.",
        default=default_flexbar_format_option)

    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files "
                        "will be overwritten.",
                        action='store_true')

    parser.add_argument('--profiles-only',
                        help="If this flag is present, then only "
                        "the ORF profiles will be created",
                        action='store_true')

    parser.add_argument(
        '-k',
        '--keep-intermediate-files',
        help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.",
        action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    # safe_load works on every PyYAML version (plain yaml.load requires an
    # explicit Loader since PyYAML 6) and never constructs arbitrary objects;
    # the context manager makes sure the config file handle is closed.
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar', args.star_executable, 'samtools', 'bowtie2',
        'create-base-genome-profile', 'remove-multimapping-reads',
        'extract-metagene-profiles', 'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets', 'extract-orf-profiles',
        'estimate-orf-bayes-factors', 'select-final-prediction-set',
        'create-orf-profiles', 'predict-translated-orfs'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data', 'ribosomal_index', 'star_index', 'genome_base_path',
        'genome_name', 'fasta', 'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    # now, check if we want to use slurm
    msg = "use_slurm: {}".format(args.use_slurm)
    logger.debug(msg)

    if args.use_slurm:
        # quote every argument so values containing spaces or shell
        # metacharacters survive the re-submission unchanged
        cmd = ' '.join(shlex.quote(arg) for arg in sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # the first step is the standard riboseq preprocessing

    # handle do_not_call so that we _do_ call the preprocessing script,
    # but that it does not run anything
    do_not_call_str = "" if call else "--do-not-call"
    overwrite_str = "--overwrite" if args.overwrite else ""

    # for a sample, we first create its filtered genome profile
    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(shlex.quote(args.tmp))

    flexbar_format_option_str = ""
    if args.flexbar_format_option is not None:
        flexbar_format_option_str = "--flexbar-format-option {}".format(
            shlex.quote(args.flexbar_format_option))

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    # user-supplied paths/names are quoted for the same reason as above
    raw_data_str = shlex.quote(args.raw_data)
    config_str = shlex.quote(args.config)
    name_str = shlex.quote(args.name)

    cmd = ("create-orf-profiles {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}"
           .format(raw_data_str, config_str, name_str, args.num_cpus,
                   mem_str, do_not_call_str, overwrite_str, logging_str,
                   star_str, tmp_str, flexbar_format_option_str,
                   keep_intermediate_str))

    shell_utils.check_call(cmd)

    # check if we only want to create the profiles
    if args.profiles_only:
        return

    # then we predict the ORFs
    cmd = ("predict-translated-orfs {} {} --num-cpus {} {} {} {}".format(
        config_str, name_str, args.num_cpus, do_not_call_str, overwrite_str,
        logging_str))
    shell_utils.check_call(cmd)
def main():
    """Command-line entry point: run the Rp-Bp / Rp-chi pipeline on one sample.

    Parses the command line, verifies that the required programs and config
    keys are available, optionally re-submits itself through SLURM, and then
    calls ``create-orf-profiles`` followed (unless ``--profiles-only`` is
    given) by ``predict-translated-orfs``. Extra flexbar options given with
    ``--flexbar-options`` are forwarded to ``create-orf-profiles``.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script runs the Rp-Bp and Rp-chi pipelines on a given sample. "
        "It requires a YAML config file that includes a number of keys. Please see the "
        "documentation for a complete description.")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name', help="The name for the dataset, used in the created files")

    parser.add_argument('--tmp', help="The temp directory", default=default_tmp)

    # the adjacent literals previously ran together ("options topass",
    # 'value"If specified'); the separators are now part of the text
    parser.add_argument('--flexbar-options', help="A space-delimited list of options to "
        "pass to flexbar. Each option must be quoted separately as in \"--flexbarOption value\". "
        "If specified, flexbar options will override default settings.", nargs='*', type=str)

    parser.add_argument('--overwrite', help="If this flag is present, existing files "
        "will be overwritten.", action='store_true')

    parser.add_argument('--profiles-only', help="If this flag is present, then only "
        "the ORF profiles will be created", action='store_true')

    parser.add_argument('-k', '--keep-intermediate-files', help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.", action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    # safe_load works on every PyYAML version (plain yaml.load requires an
    # explicit Loader since PyYAML 6) and never constructs arbitrary objects;
    # the context manager makes sure the config file handle is closed.
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'create-base-genome-profile',
        'remove-multimapping-reads',
        'extract-metagene-profiles',
        'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets',
        'extract-orf-profiles',
        'estimate-orf-bayes-factors',
        'select-final-prediction-set',
        'create-orf-profiles',
        'predict-translated-orfs'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'ribosomal_index',
        'star_index',
        'genome_base_path',
        'genome_name',
        'fasta',
        'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    # now, check if we want to use slurm
    msg = "use_slurm: {}".format(args.use_slurm)
    logger.debug(msg)

    if args.use_slurm:
        # a plain ' '.join(sys.argv) would drop the quoting around each
        # multi-word flexbar option; quoting every argument preserves it
        cmd = ' '.join(shlex.quote(arg) for arg in sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # the first step is the standard riboseq preprocessing

    # handle do_not_call so that we _do_ call the preprocessing script,
    # but that it does not run anything
    do_not_call_str = "" if call else "--do-not-call"
    overwrite_str = "--overwrite" if args.overwrite else ""

    # for a sample, we first create its filtered genome profile
    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(shlex.quote(args.tmp))

    # each flexbar option is passed on as a single, separately quoted token
    flexbar_option_str = ""
    if args.flexbar_options is not None:
        flexbar_option_str = "--flexbar-options {}".format(
            ' '.join(shlex.quote(flx_op) for flx_op in args.flexbar_options))

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    # user-supplied paths/names are quoted so that spaces survive the shell
    raw_data_str = shlex.quote(args.raw_data)
    config_str = shlex.quote(args.config)
    name_str = shlex.quote(args.name)

    cmd = ("create-orf-profiles {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}".format(
            raw_data_str, config_str, name_str, args.num_cpus, mem_str,
            do_not_call_str, overwrite_str, logging_str, star_str, tmp_str,
            flexbar_option_str, keep_intermediate_str))

    shell_utils.check_call(cmd)

    # check if we only want to create the profiles
    if args.profiles_only:
        return

    # then we predict the ORFs
    cmd = ("predict-translated-orfs {} {} --num-cpus {} {} {} {}".format(config_str,
            name_str, args.num_cpus, do_not_call_str, overwrite_str, logging_str))
    shell_utils.check_call(cmd)
def main():
    """Command-line entry point: build the reference files used by rpbp.

    Parses the command line, checks that the required programs, config keys
    and input files exist, optionally re-submits itself through SLURM, and
    then:

    * builds the bowtie2 index for the ribosomal sequences,
    * builds the STAR genome index,
    * extracts the ORFs for the reference annotation (``get_orfs``),
    * if ``de_novo_gtf`` is present in the config, extracts those ORFs as
      well and concatenates the ORF, exon and annotation files; otherwise
      symlinks the annotated files to the final file names.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates all of the files necessary for downstream "
        "analysis performed with the rpbp package.")
    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--overwrite', help="If this flag is present, existing files "
        "will be overwritten.", action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)

    # safe_load works on every PyYAML version (plain yaml.load requires an
    # explicit Loader since PyYAML 6) and never constructs arbitrary objects;
    # the context manager makes sure the config file handle is closed.
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'extract-orf-coordinates',
        'label-orfs',
        'bowtie2-build-s',
        'split-bed12-blocks',
        'gtf-to-bed12',
        args.star_executable
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'genome_base_path',
        'genome_name',
        'gtf',
        'fasta',
        'ribosomal_fasta',
        'ribosomal_index',
        'star_index'
    ]
    utils.check_keys_exist(config, required_keys)

    # check that the required files are present
    files = [
        config['gtf'],
        config['fasta'],
        config['ribosomal_fasta']
    ]

    if 'de_novo_gtf' in config:
        files.append(config['de_novo_gtf'])

    utils.check_files_exist(files, source='prepare-rpbp-genome')

    # now, check if we want to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # the rrna index
    cmd = "bowtie2-build-s {} {}".format(config['ribosomal_fasta'],
        config['ribosomal_index'])

    in_files = [config['ribosomal_fasta']]
    out_files = bio.get_bowtie2_index_files(config['ribosomal_index'])
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
        overwrite=args.overwrite, call=call)

    # the STAR index
    mem = utils.human2bytes(args.mem)
    cmd = ("{} --runMode genomeGenerate --genomeDir {} --genomeFastaFiles {} "
        "--runThreadN {} --limitGenomeGenerateRAM {}".format(args.star_executable,
        config['star_index'], config['fasta'], args.num_cpus, mem))

    in_files = [config['fasta']]
    out_files = star_utils.get_star_index_files(config['star_index'])
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
        overwrite=args.overwrite, call=call)

    # get the main orfs
    get_orfs(config['gtf'], args, config, is_annotated=True, is_de_novo=False)

    # eventually, we will use these names
    orf_note = config.get('orf_note')

    annotated_orfs = filenames.get_orfs(config['genome_base_path'],
        config['genome_name'], note=orf_note, is_annotated=True,
        is_de_novo=False)

    annotated_exons_file = filenames.get_exons(config['genome_base_path'],
        config['genome_name'], note=orf_note,
        is_annotated=True, is_de_novo=False, is_orf=True)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
        config['genome_name'], note=orf_note)

    exons_file = filenames.get_exons(config['genome_base_path'],
        config['genome_name'], note=orf_note, is_orf=True)

    # NOTE(review): only a literal "gff" suffix is recognized here, so a
    # file named "*.gff3" is not treated as GFF3 -- confirm this is intended.
    use_gff3_specs = config['gtf'].endswith('gff')
    gtf_file = filenames.get_gtf(config['genome_base_path'],
        config['genome_name'], is_gff3=use_gff3_specs, is_star_input=True)

    # now, check if we have a de novo assembly
    if 'de_novo_gtf' in config:
        get_orfs(config['de_novo_gtf'], args, config, is_annotated=False,
            is_de_novo=True)

        # we need to concat the ORF and exon files
        de_novo_orfs = filenames.get_orfs(config['genome_base_path'],
            config['genome_name'], note=orf_note, is_annotated=False,
            is_de_novo=True)

        de_novo_exons_file = filenames.get_exons(config['genome_base_path'],
            config['genome_name'], note=orf_note,
            is_annotated=False, is_de_novo=True, is_orf=True)

        orfs_files = [annotated_orfs, de_novo_orfs]

        orfs_files_str = ' '.join(orfs_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            orfs_genomic, orfs_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(orfs_files, sort_bed=True)
            # renumber the ORFs so "orf_num" is consecutive over the merged set
            concatenated_bed['orf_num'] = range(len(concatenated_bed))
            fields = bed_utils.bed12_field_names + ['orf_num', 'orf_len', 'orf_type']
            bed_utils.write_bed(concatenated_bed[fields], orfs_genomic)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

        exons_files = [annotated_exons_file, de_novo_exons_file]

        exons_files_str = ' '.join(exons_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            exons_file, exons_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(exons_files, sort_bed=True)
            fields = bed_utils.bed6_field_names + ['exon_index', 'transcript_start']
            bed_utils.write_bed(concatenated_bed[fields], exons_file)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

        # we also need to concat the annotations to inform STAR
        # there is no particular reason to merge and sort the files, so
        # we just concatenate them...
        if config['de_novo_gtf'].endswith('gff') == use_gff3_specs:
            # awk drops the comment/header lines (those starting with '#')
            cmd = ("awk '!/^#/' {} {} > {}".format(config['gtf'], config['de_novo_gtf'], gtf_file))
            in_files = [config['gtf'], config['de_novo_gtf']]
            out_files = [gtf_file]
            shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                overwrite=args.overwrite, call=call)
        else:
            # the two literals previously ran together ("(GTF2/GFF3)for")
            msg = ("Skipping concatenation due to mismatch in format specifications (GTF2/GFF3) "
                  "for reference and de novo annotations. Symlink to reference annotations created.")
            logger.warning(msg)
            if os.path.exists(config['gtf']):
                shell_utils.create_symlink(config['gtf'], gtf_file, call)

    else:
        # finally, make sure our files are named correctly

        if os.path.exists(annotated_orfs):
            shell_utils.create_symlink(annotated_orfs, orfs_genomic, call)

        if os.path.exists(annotated_exons_file):
            shell_utils.create_symlink(annotated_exons_file, exons_file, call)

        if os.path.exists(config['gtf']):
            shell_utils.create_symlink(config['gtf'], gtf_file, call)
def main():
    """Command-line entry point: extract per-read-length ORF profiles.

    For every sample (either ``name`` itself or, with ``--is-condition``,
    each replicate of the condition ``name``), one ``extract-orf-profiles``
    job is submitted per periodic (read length, offset) pair. Afterwards a
    ``collect-read-length-orf-profiles`` job, which depends on all of the
    extraction jobs, is submitted to gather the results into ``out``.

    If a sample has no periodic read lengths, a critical message is logged
    and the function returns without submitting the collection job.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Extract the ORF profiles for each specified read length "
        "and offset independently. One sparse matrix file will be created for "
        "each read length. It then collects the values into a sparse tensor.")

    # the config is parsed as YAML below, so the help text says so
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name', help="The name for the dataset, used in the "
        "created files")

    parser.add_argument('out', help="The (mtx.gz) output file containing the "
        "ORF profiles and read lengths")

    parser.add_argument('-c', '--is-condition', help="If this flag is present, "
        "then \"name\" will be taken to be a condition name. The profiles for "
        "all relevant replicates of the condition will be created.",
        action='store_true')

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    cpus_str = "--num-cpus {}".format(args.num_cpus)

    msg = "[create-read-length-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    msg = "Reading config file"
    logger.info(msg)
    # safe_load works on every PyYAML version (plain yaml.load requires an
    # explicit Loader since PyYAML 6) and never constructs arbitrary objects;
    # the context manager makes sure the config file handle is closed.
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)

    # pull out what we need from the config file
    is_unique = 'keep_riboseq_multimappers' not in config
    seqname_str = utils.get_config_argument(config, 'seqname_prefix')
    note = config.get('note', None)
    orf_note = config.get('orf_note', None)

    orfs = filenames.get_orfs(
        config['genome_base_path'],
        config['genome_name'],
        note=orf_note
    )

    exons = filenames.get_exons(
        config['genome_base_path'],
        config['genome_name'],
        note=orf_note,
        is_orf=True
    )

    # make sure the necessary files exist
    required_files = [orfs, exons]
    msg = "[create-read-length-orf-profiles]: Some input files were missing: "
    utils.check_files_exist(required_files, msg=msg)

    # check which samples to process
    names = [args.name]
    if args.is_condition:
        riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
        names = list(riboseq_replicates[args.name])

    job_ids = []
    for name in names:

        msg = "Processing sample: {}".format(name)
        logger.info(msg)

        # now the relevant files
        bam = filenames.get_riboseq_bam(
            config['riboseq_data'],
            name,
            is_unique=is_unique,
            note=note
        )

        # make sure the necessary files exist
        required_files = [bam]
        msg = "[create-read-length-orf-profiles]: Some input files were missing: "
        utils.check_files_exist(required_files, msg=msg)

        # get the lengths and offsets which meet the required criteria from the config file
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config,
            name,
            is_unique=is_unique
        )

        if len(lengths) == 0:
            msg = ("No periodic read lengths and offsets were found. Try relaxing "
                "min_metagene_profile_count, min_metagene_bf_mean, "
                "max_metagene_bf_var, and/or min_metagene_bf_likelihood. Quitting.")
            logger.critical(msg)
            return

        # one extraction job per (read length, offset) pair
        for length, offset in zip(lengths, offsets):
            lengths_str = "--lengths {}".format(length)
            offsets_str = "--offsets {}".format(offset)

            mtx = filenames.get_riboseq_profiles(
                config['riboseq_data'],
                name,
                length=[length],
                offset=[offset],
                is_unique=is_unique,
                note=note
            )

            # nine placeholders for nine arguments: the template previously
            # had only eight, so the logging options were silently dropped
            cmd = "extract-orf-profiles {} {} {} {} {} {} {} {} {}".format(
                bam,
                orfs,
                exons,
                mtx,
                lengths_str,
                offsets_str,
                seqname_str,
                cpus_str,
                logging_str
            )

            job_id = slurm.check_sbatch(cmd, args=args)

            job_ids.append(job_id)

    # now, collect them into a single file; the collection script is given
    # only the config, name and output (it does not take lengths/offsets)
    is_condition_str = ""
    if args.is_condition:
        is_condition_str = "--is-condition"

    cmd = "collect-read-length-orf-profiles {} {} {} {} {}".format(
        args.config,
        args.name,
        args.out,
        is_condition_str,
        logging_str
    )

    slurm.check_sbatch(cmd, args=args, dependencies=job_ids)
Beispiel #22
0
def main():
    """Submit one ``run-rpbp-pipeline`` call per riboseq sample.

    The command-line arguments are parsed, the required programs and config
    keys are checked, and then a ``run-rpbp-pipeline`` command is built for
    every entry of ``config['riboseq_samples']`` (in sorted order) and handed
    to ``slurm.check_sbatch``.

    If ``--merge-replicates`` is given, a ``predict-translated-orfs
    --merge-replicates`` command is additionally submitted for every condition
    returned by ``ribo_utils.get_riboseq_replicates``; these submissions list
    all of the per-sample job ids as dependencies.

    Returns:
        None.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This is a helper script which submits a set of samples to SLURM. It "
        "can also be used to run a set of samples sequentially. Due to limitations on "
        "the config file specification, all of the samples must use the same reference "
        "indices (i.e., genome sequence, set of ORFs, etc.).")

    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--tmp',
                        help="The temp directory",
                        default=default_tmp)

    parser.add_argument(
        '--flexbar-format-option',
        help="The name of the \"format\" "
        "option for flexbar. This changed from \"format\" to \"qtrim-format\" in "
        "version 2.7.",
        default=default_flexbar_format_option)

    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files "
                        "will be overwritten.",
                        action='store_true')

    parser.add_argument(
        '--merge-replicates',
        help="If this flag is present, then "
        "the ORF profiles from the replicates will be merged before making the final "
        "predictions",
        action='store_true')

    parser.add_argument(
        '--run-replicates',
        help="If this flag is given with the "
        "--merge-replicates flag, then both the replicates *and* the individual "
        "samples will be run. This flag has no effect if --merge-replicates is not "
        "given.",
        action='store_true')

    parser.add_argument(
        '-k',
        '--keep-intermediate-files',
        help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.",
        action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    # Close the config file deterministically and pass an explicit Loader:
    # calling yaml.load() without one is deprecated in PyYAML 5.x and a
    # TypeError in PyYAML 6+.
    # NOTE(review): FullLoader matches the PyYAML 5.x default; if config files
    # may come from untrusted sources, yaml.safe_load would be the safer choice.
    with open(args.config) as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar', args.star_executable, 'samtools', 'bowtie2',
        'create-base-genome-profile', 'remove-multimapping-reads',
        'extract-metagene-profiles', 'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets', 'extract-orf-profiles',
        'estimate-orf-bayes-factors', 'select-final-prediction-set',
        'create-orf-profiles', 'predict-translated-orfs', 'run-rpbp-pipeline'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data', 'riboseq_samples', 'ribosomal_index', 'star_index',
        'genome_base_path', 'genome_name', 'fasta', 'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)

    # handle do_not_call so that we _do_ call the pipeline script, but that it does not run anything
    do_not_call_str = ""
    if not call:
        do_not_call_str = "--do-not-call"
    args.do_not_call = False

    overwrite_str = ""
    if args.overwrite:
        overwrite_str = "--overwrite"

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    # if we merge the replicates, then we only use the rpbp script to create
    # the ORF profiles
    profiles_only_str = ""
    if args.merge_replicates and not args.run_replicates:
        profiles_only_str = "--profiles-only"

    # --run-replicates only has an effect together with --merge-replicates
    if args.run_replicates and not args.merge_replicates:
        msg = (
            "The --run-replicates option was given without the --merge-replicates "
            "option. It will be ignored.")
        logger.warning(msg)

    flexbar_format_option_str = ""
    if args.flexbar_format_option is not None:
        flexbar_format_option_str = "--flexbar-format-option {}".format(
            args.flexbar_format_option)

    # collect the job_ids in case we are using slurm and need to merge replicates
    job_ids = []

    sample_names = sorted(config['riboseq_samples'].keys())

    for sample_name in sample_names:
        data = config['riboseq_samples'][sample_name]

        # each sample gets its own temp directory below the user-given one
        tmp_str = ""
        if args.tmp is not None:
            tmp = os.path.join(args.tmp,
                               "{}_{}_rpbp".format(sample_name, note))
            tmp_str = "--tmp {}".format(tmp)

        cmd = "run-rpbp-pipeline {} {} {} --num-cpus {} {} {} {} {} {} {} {} {} {}".format(
            data, args.config, sample_name, args.num_cpus, tmp_str,
            do_not_call_str, overwrite_str, logging_str, star_str,
            profiles_only_str, flexbar_format_option_str,
            keep_intermediate_str, mem_str)

        job_id = slurm.check_sbatch(cmd, args=args)

        job_ids.append(job_id)

    # now, if we are running the "standard" pipeline, we are finished
    if not args.merge_replicates:
        return

    # otherwise, we need to merge the replicates for each condition
    riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
    merge_replicates_str = "--merge-replicates"

    for condition_name in sorted(riboseq_replicates.keys()):

        # then we predict the ORFs
        cmd = "predict-translated-orfs {} {} --num-cpus {} {} {} {} {}".format(
            args.config, condition_name, args.num_cpus, do_not_call_str,
            overwrite_str, logging_str, merge_replicates_str)

        # the merged predictions must wait for all of the per-sample jobs
        slurm.check_sbatch(cmd, args=args, dependencies=job_ids)