def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script collects counts of riboseq reads filtered at each step in "
        "the micropeptide prediction pipeline. It mostly parses fastqc results (using the "
        "crimson python package).")
    parser.add_argument('config', help="The yaml config file")
    parser.add_argument('out', help="The output csv file with the counts")
    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of processors to use",
                        type=int,
                        default=default_num_cpus)
    parser.add_argument('--overwrite', action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    programs = ['samtools']
    shell_utils.check_programs_exist(programs)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    res = parallel.apply_parallel_iter(config['riboseq_samples'].items(),
                                       args.num_cpus, get_counts, config, args)
    res = [r for r in res if r is not None]
    res_df = pd.DataFrame(res)

    utils.write_df(res_df, args.out, index=False)
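The get_counts worker passed to parallel.apply_parallel_iter above is defined elsewhere in the original script. A minimal sketch of the shape such a worker might take, assuming it receives one (name, fastq) item from riboseq_samples together with the config and parsed args and returns a dict of counts (or None when a sample cannot be processed); the raw-read counting below is purely illustrative:

import gzip
import os

def get_counts(sample_item, config, args):
    """Hypothetical worker: collect read counts for one riboseq sample.

    args is accepted only to mirror the parallel call signature; it is unused here.
    """
    name, raw_fastq = sample_item

    if not os.path.exists(raw_fastq):
        return None

    # a fastq record spans four lines, so the raw read count is lines / 4
    open_fn = gzip.open if raw_fastq.endswith(".gz") else open
    with open_fn(raw_fastq, "rt") as f:
        raw_data_count = sum(1 for _ in f) // 4

    # the real pipeline would also parse the fastqc (crimson) reports and
    # samtools output for the downstream filtering steps
    return {"name": name, "note": config.get("note"), "raw_data_count": raw_data_count}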
Example 2
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script extracts all of the UTR information from a "
        "transcripts file created by gffread. It writes it into a csv file, so "
        "it can easily be imported and manipulated with pandas.")

    parser.add_argument('transcripts', help="A gffread-like fasta file")
    parser.add_argument('out', help="The (csv.gz) output file")

    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    utr_info = gffread_utils.get_all_utrs(args.transcripts)

    utils.write_df(utr_info, args.out, index=False)
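gffread_utils.get_all_utrs is not shown here. As a hedged sketch of the idea: transcripts written by gffread usually carry a CDS=start-end attribute in the fasta header, so the UTR lengths can be derived from those coordinates and the sequence length (the attribute name and header layout are assumptions, not taken from the source):

import re

def get_utr_info(header, sequence):
    """Hypothetical per-record parser for a gffread-style fasta header.

    Assumes a 'CDS=start-end' attribute with 1-based, inclusive coordinates.
    """
    m = re.search(r"CDS=(\d+)-(\d+)", header)
    if m is None:
        return None

    cds_start, cds_end = int(m.group(1)), int(m.group(2))
    return {
        "transcript_id": header.split()[0],
        "five_prime_utr_len": cds_start - 1,
        "three_prime_utr_len": len(sequence) - cds_end,
    }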
Example 3
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script extracts the differential micropeptides from two "
        "conditions. Please see the documentation in redmine for more details.\n\n"
        "Please see the pyensembl (https://github.com/hammerlab/pyensembl) "
        "documentation for more information about the ensembl release and species."
    )

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name_a', help="The name of the first condition")
    parser.add_argument('name_b', help="The name of the second condition")
    parser.add_argument('out', help="The output (.csv.gz or .xlsx) file")

    parser.add_argument(
        '-a',
        '--append-sheet',
        help="If this flag is given, "
        "then a worksheet with the name '<name_a>,<name_b>' will be appended "
        "to the .xlsx file given by out (if it exists)",
        action='store_true')

    parser.add_argument(
        '-f',
        '--filter',
        help="If this flag is present, then "
        "the output will be filtered to include only the differential "
        "micropeptides with the highest KL-divergence and read coverage",
        action='store_true')

    parser.add_argument(
        '--read-filter-percent',
        help="If the the --filter flag "
        "is given, then only the top --read-filter-percent micropeptides will "
        "be considered for the final output. They still must meet the KL-"
        "divergence filtering criteria.",
        type=float,
        default=default_read_filter_percent)

    parser.add_argument(
        '--kl-filter-percent',
        help="If the the --filter flag "
        "is given, then only the top --read-kl-percent micropeptides will "
        "be considered for the final output. They still must meet the read "
        "coverage filtering criteria.",
        type=float,
        default=default_kl_filter_percent)

    parser.add_argument(
        '--id-matches',
        help="This is a list of files which "
        "contain ORF identifiers to compare to the differential micropeptides. "
        "For each of the files given, two columns will be added to the output "
        "which indicate if either A or B appear in the respective file. Each "
        "file should have a single ORF identifier on each line and contain "
        "nothing else.",
        nargs='*',
        default=default_id_matches)

    parser.add_argument(
        '--id-match-names',
        help="A name to include in the "
        "output file for each --id-matches file. The number of names must "
        "match the number of files.",
        nargs='*',
        default=default_id_match_names)

    parser.add_argument(
        '--overlaps',
        help="This is a list of bed12+ files "
        "which will be compared to the differential micropeptides. Two columns "
        "(one for A, one for B) will be added to the output which indicate if "
        "the respective micropeptides overlap a feature in each file by at "
        "least 1 bp.",
        nargs='*',
        default=default_overlaps)

    parser.add_argument(
        '--overlap-names',
        help="A name to include in the "
        "output file for each --overlaps file. The number of names must match "
        "the number of files.",
        nargs='*',
        default=default_overlap_names)

    parser.add_argument(
        '-r',
        '--ensembl-release',
        help="The version of Ensembl "
        "to use when mapping transcript identifiers to gene identifiers",
        type=int,
        default=default_ensembl_release)

    parser.add_argument(
        '-s',
        '--ensembl-species',
        help="The Ensembl species "
        "to use when mapping transcript identifiers to gene identifiers",
        default=default_ensembl_species)

    parser.add_argument(
        '--a-is-single-sample',
        help="By default, this script "
        "assumes the predictions come from merged replicates. If name_a is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.",
        action='store_true')

    parser.add_argument(
        '--b-is-single-sample',
        help="By default, this script "
        "assumes the predictions come from merged replicates. If name_b is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.",
        action='store_true')

    parser.add_argument('--fields-to-keep',
                        help="The fields to keep from the "
                        "Bayes factor file for each condition",
                        nargs='*',
                        default=default_fields_to_keep)

    parser.add_argument('--max-micropeptide-len',
                        help="The maximum (inclusive) "
                        "length of ORFs considered as micropeptides",
                        type=int,
                        default=default_max_micropeptide_len)

    parser.add_argument(
        '--do-not-fix-tcons',
        help="By default, the \"TCONS_\" "
        "identifiers from StringTie, etc., do not parse correctly; this script "
        "update the identifiers so that will parse correctly unless instructed not "
        "to. The script is likely to crash if the identifiers are not fixed.",
        action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ensembl database"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release,
                                       species=args.ensembl_species)
    # touching the database surfaces an informative error here if the
    # requested Ensembl release has not been installed yet
    ensembl.db

    msg = "Checking the id-match and overlaps files"
    logger.info(msg)

    if len(args.id_matches) != len(args.id_match_names):
        msg = ("The number of --id-matches files and --id-match-names do not "
               "match. {} files and {} names".format(len(args.id_matches),
                                                     len(args.id_match_names)))

        raise ValueError(msg)

    if len(args.overlaps) != len(args.overlap_names):
        msg = ("The number of --overlaps files and --overlaps-names do not "
               "match. {} files and {} names".format(len(args.overlaps),
                                                     len(args.overlap_names)))

        raise ValueError(msg)

    utils.check_files_exist(args.id_matches)
    utils.check_files_exist(args.overlaps)

    if args.filter:
        msg = "Validating filter percentages"
        logger.info(msg)

        math_utils.check_range(args.read_filter_percent,
                               0,
                               1,
                               variable_name="--read-filter-percent")

        math_utils.check_range(args.kl_filter_percent,
                               0,
                               1,
                               variable_name="--kl-filter-percent")

    msg = "Extracting file names"
    logger.info(msg)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    note_str = config.get('note', None)

    # keep multimappers?
    is_unique = 'keep_riboseq_multimappers' not in config

    # and the smoothing parameters
    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations',
                                        None)

    lengths_a = None
    offsets_a = None

    if args.a_is_single_sample:
        lengths_a, offsets_a = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name_a, is_unique=is_unique)

    bayes_factors_a = filenames.get_riboseq_bayes_factors(
        config['riboseq_data'],
        args.name_a,
        length=lengths_a,
        offset=offsets_a,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors_a):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".
               format(args.name_a, bayes_factors_a))
        raise FileNotFoundError(msg)

    predicted_orfs_a = filenames.get_riboseq_predicted_orfs(
        config['riboseq_data'],
        args.name_a,
        length=lengths_a,
        offset=offsets_a,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations,
        is_filtered=True,
        is_chisq=False)

    if not os.path.exists(predicted_orfs_a):
        msg = (
            "Could not find the predictions bed file for {}. ({}). Quitting.".
            format(args.name_a, predicted_orfs_a))
        raise FileNotFoundError(msg)

    lengths_b = None
    offsets_b = None
    if args.b_is_single_sample:
        lengths_b, offsets_b = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name_b, is_unique=is_unique)

    bayes_factors_b = filenames.get_riboseq_bayes_factors(
        config['riboseq_data'],
        args.name_b,
        length=lengths_b,
        offset=offsets_b,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors_b):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".
               format(args.name_b, bayes_factors_b))
        raise FileNotFoundError(msg)

    predicted_orfs_b = filenames.get_riboseq_predicted_orfs(
        config['riboseq_data'],
        args.name_b,
        length=lengths_b,
        offset=offsets_b,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations,
        is_filtered=True,
        is_chisq=False)

    if not os.path.exists(predicted_orfs_b):
        msg = (
            "Could not find the predictions bed file for {}. ({}). Quitting.".
            format(args.name_b, predicted_orfs_b))
        raise FileNotFoundError(msg)

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    if not os.path.exists(exons_file):
        msg = "Could not find the exons file ({}). Quitting.".format(
            exons_file)
        raise FileNotFoundError(msg)

    msg = "Reading the exons"
    logger.info(msg)

    exons = bed_utils.read_bed(exons_file)

    msg = "Reading the BF files"
    logger.info(msg)

    bf_df_a = bed_utils.read_bed(bayes_factors_a)
    bf_df_b = bed_utils.read_bed(bayes_factors_b)

    msg = "Reading the predictions files"
    logger.info(msg)

    bed_df_a = bed_utils.read_bed(predicted_orfs_a)
    bed_df_b = bed_utils.read_bed(predicted_orfs_b)

    differential_micropeptide_dfs = []

    # extract micropeptides
    msg = "Extracting micropeptides"
    logger.info(msg)

    m_micropeptides_a = bed_df_a['orf_len'] <= args.max_micropeptide_len
    m_micropeptides_b = bed_df_b['orf_len'] <= args.max_micropeptide_len

    micropeptides_a = bed_df_a[m_micropeptides_a]
    micropeptides_b = bed_df_b[m_micropeptides_b]

    long_orfs_a = bed_df_a[~m_micropeptides_a]
    long_orfs_b = bed_df_b[~m_micropeptides_b]

    msg = "Finding micropeptides in A with no overlap in B"
    logger.info(msg)

    micropeptides_a_no_match_b = bed_utils.subtract_bed(micropeptides_a,
                                                        bed_df_b,
                                                        exons=exons)

    micropeptides_a_no_match_b_df = pd.DataFrame()
    micropeptides_a_no_match_b_df['A'] = list(micropeptides_a_no_match_b)
    micropeptides_a_no_match_b_df['B'] = None
    micropeptides_a_no_match_b_df['kl'] = np.inf
    micropeptides_a_no_match_b_df['overlap_type'] = 'micro_a_only'

    differential_micropeptide_dfs.append(micropeptides_a_no_match_b_df)

    msg = "Finding micropeptides in B with no overlap in A"
    logger.info(msg)

    micropeptides_b_no_match_a = bed_utils.subtract_bed(micropeptides_b,
                                                        bed_df_a,
                                                        exons=exons)

    micropeptides_b_no_match_a_df = pd.DataFrame()
    micropeptides_b_no_match_a_df['B'] = list(micropeptides_b_no_match_a)
    micropeptides_b_no_match_a_df['A'] = None
    micropeptides_b_no_match_a_df['kl'] = np.inf
    micropeptides_b_no_match_a_df['overlap_type'] = 'micro_b_only'

    differential_micropeptide_dfs.append(micropeptides_b_no_match_a_df)

    msg = "Finding overlapping micropeptides"
    logger.info(msg)

    micropeptides_a_micropeptides_b_df = get_overlap_df(
        micropeptides_a, micropeptides_b, 'micro_a_micro_b', bf_df_a, bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_micropeptides_b_df)

    micropeptides_a_long_b_df = get_overlap_df(micropeptides_a, long_orfs_b,
                                               'micro_a_long_b', bf_df_a,
                                               bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_long_b_df)

    micropeptides_b_long_a_df = get_overlap_df(long_orfs_a, micropeptides_b,
                                               'long_a_micro_b', bf_df_a,
                                               bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_b_long_a_df)

    differential_micropeptides_df = pd.concat(differential_micropeptide_dfs)

    msg = "Adding read count information"
    logger.info(msg)

    res = differential_micropeptides_df.merge(bf_df_a[args.fields_to_keep],
                                              left_on='A',
                                              right_on='id',
                                              how='left')
    to_rename = {f: "{}_A".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_A', axis=1)

    res = res.merge(bf_df_b[args.fields_to_keep],
                    left_on='B',
                    right_on='id',
                    how='left')
    to_rename = {f: "{}_B".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_B', axis=1)

    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    if not args.do_not_fix_tcons:
        # replace TCONS_ with TCONS
        res['A'] = res['A'].str.replace("TCONS_", "TCONS")
        res['B'] = res['B'].str.replace("TCONS_", "TCONS")

    msg = "Extracting the genes and their biotypes using pyensembl"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release,
                                       species=args.ensembl_species)
    ensembl_transcript_ids = set(ensembl.transcript_ids())

    biotypes_a = parallel.apply_df_simple(res, get_transcript_and_biotype, 'A',
                                          ensembl, ensembl_transcript_ids)
    biotypes_b = parallel.apply_df_simple(res, get_transcript_and_biotype, 'B',
                                          ensembl, ensembl_transcript_ids)

    biotypes_a = utils.remove_nones(biotypes_a)
    biotypes_b = utils.remove_nones(biotypes_b)

    biotypes_a = pd.DataFrame(biotypes_a)
    biotypes_b = pd.DataFrame(biotypes_b)

    res = res.merge(biotypes_a, on='A', how='left')
    res = res.merge(biotypes_b, on='B', how='left')

    msg = "Pulling annotations from mygene.info"
    logger.info(msg)

    # pull annotations from mygene
    gene_info_a = mygene_utils.query_mygene(res['gene_id_A'])
    gene_info_b = mygene_utils.query_mygene(res['gene_id_B'])

    # and add the mygene info
    res = res.merge(gene_info_a,
                    left_on='gene_id_A',
                    right_on='gene_id',
                    how='left')

    to_rename = {f: "{}_A".format(f) for f in gene_info_a.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)

    res = res.merge(gene_info_b,
                    left_on='gene_id_B',
                    right_on='gene_id',
                    how='left')

    to_rename = {f: "{}_B".format(f) for f in gene_info_b.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)

    msg = "Removing duplicates"
    logger.info(msg)
    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    msg = "Adding --id-matches columns"
    logger.info(msg)

    for (id_match_file, name) in zip(args.id_matches, args.id_match_names):
        res = add_id_matches(res, id_match_file, name)

    msg = "Adding --overlaps columns"
    logger.info(msg)

    for (overlap_file, name) in zip(args.overlaps, args.overlap_names):
        res = add_overlaps(res, overlap_file, name, bed_df_a, bed_df_b, exons)

    msg = "Sorting by in-frame reads"
    logger.info(msg)

    res['x_1_sum_A'] = res['x_1_sum_A'].fillna(0)
    res['x_1_sum_B'] = res['x_1_sum_B'].fillna(0)
    res['x_1_sum'] = res['x_1_sum_A'] + res['x_1_sum_B']
    res = res.sort_values('x_1_sum', ascending=False)

    if args.filter:
        msg = "Filtering the micropeptides by read coverage and KL-divergence"
        logger.info(msg)

        x_1_sum_ranks = res['x_1_sum'].rank(method='min',
                                            na_option='top',
                                            ascending=False)
        num_x_1_sum_ranks = x_1_sum_ranks.max()
        max_good_x_1_sum_rank = num_x_1_sum_ranks * args.read_filter_percent
        m_good_x_1_sum_rank = x_1_sum_ranks <= max_good_x_1_sum_rank

        msg = ("Number of micropeptides passing read filter: {}".format(
            sum(m_good_x_1_sum_rank)))
        logger.debug(msg)

        kl_ranks = res['kl'].rank(method='dense',
                                  na_option='top',
                                  ascending=False)
        num_kl_ranks = kl_ranks.max()
        max_good_kl_rank = num_kl_ranks * args.kl_filter_percent
        m_good_kl_rank = kl_ranks <= max_good_kl_rank

        msg = ("Number of micropeptides passing KL filter: {}".format(
            sum(m_good_kl_rank)))
        logger.debug(msg)

        m_both_filters = m_good_x_1_sum_rank & m_good_kl_rank

        msg = ("Number of micropeptides passing both filters: {}".format(
            sum(m_both_filters)))
        logger.debug(msg)

        res = res[m_both_filters]

    msg = "Writing differential micropeptides to disk"
    logger.info(msg)

    if not args.append_sheet:
        utils.write_df(res, args.out, index=False)
    else:
        sheet_name = "{},{}".format(args.name_a, args.name_b)
        utils.append_to_xlsx(res, args.out, sheet=sheet_name, index=False)
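get_overlap_df is defined elsewhere in the original script. A schematic sketch of its output, assuming bed data frames with seqname, strand, start, end and id columns; the real implementation is presumably exon-aware and derives the kl column from the two Bayes factor data frames, so a placeholder is used here:

import numpy as np
import pandas as pd

def get_overlap_df(bed_a, bed_b, overlap_type, bf_df_a, bf_df_b):
    """Hypothetical sketch: pair ORFs from A and B whose intervals overlap."""
    pairs = bed_a.merge(bed_b, on=['seqname', 'strand'], suffixes=('_A', '_B'))
    m_overlap = ((pairs['start_A'] < pairs['end_B']) &
                 (pairs['start_B'] < pairs['end_A']))
    pairs = pairs[m_overlap]

    overlap_df = pd.DataFrame()
    overlap_df['A'] = pairs['id_A'].values
    overlap_df['B'] = pairs['id_B'].values
    overlap_df['kl'] = np.nan  # placeholder for the KL divergence of the profiles
    overlap_df['overlap_type'] = overlap_type
    return overlap_df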
Example 4
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script removes all of the peptides which match to multiple "
        "ORFs from the results found with get-all-orf-peptide-matches.")

    parser.add_argument('peptide_matches', help="The peptide matches file produced "
        "by get-all-orf-peptide-matches")
    parser.add_argument('out', help="A similar peptide matches file which "
        "contains only peptides which match to a unique ORF")

    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use",
        type=int, default=default_num_cpus)
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading peptide matches"
    logger.info(msg)

    peptide_matches = pd.read_csv(args.peptide_matches)

    msg = "Splitting the grouped matches into individual peptide matches"
    logger.info(msg)

    matches = parallel.apply_parallel(
        peptide_matches, args.num_cpus, parse_matches, progress_bar=True)

    msg = "Removing peptides which match to multiple ORFs"
    logger.info(msg)

    matches = utils.remove_nones(matches)
    matches = utils.flatten_lists(matches)
    matches_df = pd.DataFrame(matches)
    unique_matches_df = matches_df.drop_duplicates(subset='peptide', keep=False)

    msg = "Merging the ORF-peptide matches back to single records"
    logger.info(msg)

    unique_groups = unique_matches_df.groupby('orf_id')
    merged_unique_groups = parallel.apply_parallel_groups(
        unique_groups, args.num_cpus, merge_group, progress_bar=True)

    merged_unique_df = pd.DataFrame(merged_unique_groups)

    msg = "Re-adding the ORFs which no longer have peptide matches"
    logger.info(msg)

    m_still_has_match = peptide_matches['orf_id'].isin(merged_unique_df['orf_id'])
    peptide_matches.loc[~m_still_has_match, 'num_matches'] = 0
    peptide_matches.loc[~m_still_has_match, 'peptide_matches'] = 0

    peps = [merged_unique_df, peptide_matches[~m_still_has_match]]
    merged_unique_df = pd.concat(peps)

    msg = "Writing the ORFs with unique matches to disk"
    logger.info(msg)

    utils.write_df(merged_unique_df, args.out, index=False)
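parse_matches and merge_group come from the original script and are not shown. A hedged sketch of the round trip they perform, assuming the grouped matches file stores each ORF's peptides as a ';'-separated string (the separator and column names are assumptions):

def parse_matches(row):
    """Hypothetical: split one grouped record into per-peptide records."""
    if row['num_matches'] == 0:
        return None
    peptides = str(row['peptide_matches']).split(';')
    return [{'orf_id': row['orf_id'], 'peptide': p} for p in peptides]

def merge_group(group):
    """Hypothetical: collapse the per-peptide records for one ORF into one row."""
    return {
        'orf_id': group['orf_id'].iloc[0],
        'num_matches': len(group),
        'peptide_matches': ';'.join(group['peptide']),
    }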
Example 5
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script uses the peptides.txt file from MaxQuant to determine "
        "which predicted ORFs have some proteomics evidence.\n\nIt contains "
        "some hard-coded field names.")
    parser.add_argument('predicted_proteins',
                        help="The (fasta, protein) file of "
                        "predicted ORFs")
    parser.add_argument('peptides',
                        help="The peptides.txt file produced by MaxQuant")
    parser.add_argument(
        'out',
        help="The output (csv.gz) file containing the predicted "
        "ORFs and their coverage")

    parser.add_argument('--num-cpus',
                        help="The number of CPUs to use for searching",
                        type=int,
                        default=default_num_cpus)

    parser.add_argument('--peptide-filter-field',
                        help="The field to use for filtering "
                        "the peptides from MaxQuant",
                        default=default_peptide_filter_field)
    parser.add_argument('--peptide-filter-value',
                        help="All peptides with a value greater "
                        "than the filter value will be removed",
                        type=float,
                        default=default_peptide_filter_value)

    parser.add_argument('--peptide-separator',
                        help="The separator in the --peptide file",
                        default=default_peptide_separator)

    parser.add_argument(
        '-g',
        '--num-groups',
        help="The number of groups into which to split "
        "the ORFs. More groups means the progress bar is updated more frequently but incurs "
        "more overhead because of the parallel calls.",
        type=int,
        default=default_num_groups)

    parser.add_argument(
        '--num-peptides',
        help="If n>0, then only the first n peptide "
        "sequences will be used to calculate coverage. This is for testing.",
        type=int,
        default=default_num_peptides)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[get-orf-peptide-matches]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    msg = "Reading and filtering peptides"
    logger.info(msg)

    peptides = pd.read_csv(args.peptides, sep=args.peptide_separator)
    mask_filter = peptides[
        args.peptide_filter_field] < args.peptide_filter_value
    peptides = peptides[mask_filter]
    peptide_sequences = pd.DataFrame(peptides['Sequence'])

    if args.num_peptides > 0:
        peptide_sequences = peptide_sequences.head(args.num_peptides)

    msg = "Number of filtered peptides: {}".format(len(peptide_sequences))
    logger.info(msg)

    msg = "Reading predicted ORFs into a data frame"
    logger.info(msg)

    # TODO: use read iterator
    predicted_orfs = bio.get_read_iterator(args.predicted_proteins)
    orf_ids = []
    orf_sequences = []

    for orf_id, seq in predicted_orfs:
        orf_ids.append(orf_id)
        orf_sequences.append(seq)

    predicted_orfs_df = pd.DataFrame()
    predicted_orfs_df['orf_id'] = orf_ids
    predicted_orfs_df['orf_sequence'] = orf_sequences

    msg = "Searching for matching peptides"
    logger.info(msg)

    peptide_matches = parallel.apply_parallel_split(peptide_sequences,
                                                    args.num_cpus,
                                                    find_matching_orfs_group,
                                                    predicted_orfs_df,
                                                    progress_bar=True,
                                                    num_groups=args.num_groups)

    # filter out the Nones to avoid DataFrame conversion problems
    msg = "Joining results back into large data frame"
    logger.info(msg)

    peptide_matches = [pm for pm in peptide_matches if pm is not None]
    peptide_matches = pd.concat(peptide_matches)

    # now, we have a data frame of matches (fields: peptide, orf_id)
    msg = "Getting peptide coverage of ORFs"
    logger.info(msg)

    # first, count the matches for each ORF
    peptide_matches_groups = peptide_matches.groupby('orf_id')

    orf_matches = parallel.apply_parallel_groups(peptide_matches_groups,
                                                 args.num_cpus,
                                                 count_matches,
                                                 progress_bar=True)
    orf_matches = pd.DataFrame(orf_matches)

    # then join back on the original list of ORFs to have entries for ORFs
    # with no peptide matches
    predicted_orf_coverage = pd.merge(predicted_orfs_df,
                                      orf_matches,
                                      on='orf_id',
                                      how="left")

    # and patch the holes in the data frame
    predicted_orf_coverage = predicted_orf_coverage.fillna(0)

    msg = "Writing coverage information to disk"
    utils.write_df(predicted_orf_coverage, args.out, index=False)
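find_matching_orfs_group and count_matches are not shown above. A minimal sketch of the search, assuming a plain substring match of each peptide against every predicted protein sequence (the real code may use a more efficient strategy):

import pandas as pd

def find_matching_orfs_group(peptide_group, predicted_orfs_df):
    """Hypothetical: find all ORFs whose protein sequence contains each peptide."""
    matches = []
    for peptide in peptide_group['Sequence']:
        m = predicted_orfs_df['orf_sequence'].str.contains(peptide, regex=False)
        for orf_id in predicted_orfs_df.loc[m, 'orf_id']:
            matches.append({'peptide': peptide, 'orf_id': orf_id})

    if len(matches) == 0:
        return None
    return pd.DataFrame(matches)

def count_matches(orf_group):
    """Hypothetical: count the peptides matching one ORF."""
    return {'orf_id': orf_group['orf_id'].iloc[0], 'num_matches': len(orf_group)}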
Example 6
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script uses the mygene.info service to find annotations "
        "for the transcripts associated with the ORFs in the given bed file. In "
        "particular, it extracts information from Swiss-Prot, TrEMBL, Interpro, "
        "PDB, Pfam, PROSITE, the Gene Ontology, and KEGG.")

    parser.add_argument('bed', help="The bed file")
    parser.add_argument('out', help="The output file. Its type will be inferred "
        "from its extension.")

    parser.add_argument('--do-not-trim', help="By default, the script will "
        "attempt to trim transcript identifiers such that they are valid Ensembl "
        "identifiers. If this flag is given, no trimming will take place.",
        action='store_true')

    parser.add_argument('--scopes', help="A list of scopes to use when querying "
        "mygene.info. Please see the documentation for more information about "
        "valid scopes: http://mygene.info/doc/query_service.html#available_fields",
        nargs='*', default=default_scopes)

    parser.add_argument('--do-not-convert-ids', help="By default, the script will "
        "treat the identifiers in the file as transcript identifiers. It first "
        "maps those to gene identifiers, and then it uses those to find the "
        "gene annotations. If the identifiers are already gene ids (or whatever "
        "is specified by scopes), then the first mapping is not necessary and "
        "can be skipped using this flag.", action='store_true')
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    convert_ids = not args.do_not_convert_ids

    msg = "Reading the bed file"
    logger.info(msg)
    bed = bed_utils.read_bed(args.bed)
    bed = bed[fields_to_keep]

    msg = "Extracting transcript ids"
    logger.info(msg)
    trim = not args.do_not_trim
    orf_ids = parallel.apply_iter_simple(bed['id'], parse_orf_id, trim)
    orf_ids_df = pd.DataFrame(orf_ids)

    if convert_ids:
        msg = "Querying transcript to gene id mapping"
        logger.info(msg)
        gene_ids = mygene_utils.get_transcript_to_gene_mapping(orf_ids_df['transcript_id'])
    else:
        gene_ids = pd.DataFrame()
        gene_ids['transcript_id'] = orf_ids_df['transcript_id']
        gene_ids['gene_id'] = orf_ids_df['transcript_id']

    msg = "Querying gene annotations"
    logger.info(msg)
    res_df = mygene_utils.query_mygene(gene_ids['gene_id'])

    msg = "Combining gene annotations with transcript ids"
    logger.info(msg)
    res_df = gene_ids.merge(res_df, on='gene_id', how='inner')

    msg = "Combining transcript annotations with ORF ids"
    logger.info(msg)
    orf_ids_fields = ['transcript_id', 'orf_id']
    res_df = orf_ids_df[orf_ids_fields].merge(res_df, on='transcript_id', how='inner')

    msg = "Combining ORF annotations with ORF predictions"
    logger.info(msg)
    res_df = bed.merge(res_df, left_on='id', right_on='orf_id', how='left')

    msg = "Writing ORF annotations to disk"
    logger.info(msg)
    utils.write_df(res_df, args.out, index=False)
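parse_orf_id is part of the original script. A hedged sketch, assuming ORF identifiers have the form '<transcript_id>_<coordinates>' and that trimming means dropping an Ensembl version suffix (both assumptions):

def parse_orf_id(orf_id, trim):
    """Hypothetical: extract the transcript id from an ORF identifier."""
    transcript_id = orf_id.split('_')[0]
    if trim:
        # drop a version suffix such as '.3' so the id is a valid Ensembl id
        transcript_id = transcript_id.split('.')[0]
    return {'orf_id': orf_id, 'transcript_id': transcript_id}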
Example 7
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Test models learned with train-as-auto-sklearn. It "
        "writes the predictions to disk as a \"long\" data frame. The output "
        "file is in gzipped csv format.")
    
    parser.add_argument('scenario', help="The ASlib scenario")
    
    parser.add_argument('model_template', help="A template string for the filenames of "
        "the learned models. ${solver} and ${fold} are the template parts of "
        "the string. It is probably necessary to surround this argument with "
        "single quotes in order to prevent shell replacement of the template "
        "parts.")

    parser.add_argument('out', help="The output csv file")

    parser.add_argument('--config', help="A (yaml) config file which "
        "specifies options controlling the learner behavior")
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ASlib scenario"
    logger.info(msg)

    scenario = ASlibScenario()
    scenario.read_scenario(args.scenario)

    if args.config is not None:
        msg = "Loading yaml config file"
        logger.info(msg)
        config = yaml.load(open(args.config), Loader=yaml.FullLoader)
    else:
        config = {}

    msg = "Creating string templates"
    logger.info(msg)
    model_template = string.Template(args.model_template)

    msg = "Finding folds from ASlib scenario"
    logger.info(msg)
        
    folds = [int(i) for i in scenario.cv_data['fold'].unique()]
    folds = sorted(folds)

    msg = "Making predictions"
    logger.info(msg)

    all_predictions = []
    it = itertools.product(scenario.algorithms, folds)
    for solver, fold in it:
        
        model_file = model_template.substitute(solver=solver, fold=fold)
        
        if not os.path.exists(model_file):
            msg = "Could not find model file. Skipping: {}".format(model_file)
            logger.warning(msg)
            continue
        
        try:
            model = joblib.load(model_file)
        except Exception:
            msg = ("Problem loading the model file. Skipping: {}".format(
                model_file))
            logger.warning(msg)
            continue
            
        msg = "Processing. solver: {}. fold: {}".format(solver, fold)
        logger.info(msg)
        
        testing, training = scenario.get_split(fold)
        y_pred = model.predict(testing.feature_data)

        if config.get('log_performance_data', True):
            # exp transform it back out
            y_pred = np.expm1(y_pred)
            
        pred_df = pd.DataFrame()
        pred_df['instance_id'] = testing.feature_data.index
        pred_df['solver'] = solver
        pred_df['fold'] = fold
        pred_df['actual'] = testing.performance_data[solver].values
        pred_df['predicted'] = y_pred
        
        all_predictions.append(pred_df)
        
    msg = "Joining all predictions in a long data frame"
    logger.info(msg)
    all_predictions = pd.concat(all_predictions)

    msg = "Writing predictions to disk"
    logger.info(msg)

    utils.write_df(all_predictions, args.out, index=False)
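For reference, the ${solver} and ${fold} substitution behaves as follows (the model path below is illustrative only):

import string

model_template = string.Template("models/${solver}.fold-${fold}.pkl")
model_file = model_template.substitute(solver="lingeling", fold=3)
# model_file == "models/lingeling.fold-3.pkl"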
Example 8
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Given a meme motif file, extract the gene names and map "
        "them to ensembl identifiers using pyensembl. The pyensembl database "
        "information can be given either in a yaml config file or as command "
        "line options. The yaml config file values have precedence over the "
        "command line options.")

    parser.add_argument('meme', help="The meme file")
    parser.add_argument('out', help="The output file")

    parser.add_argument(
        '-c',
        '--config',
        help="The yaml config file. If "
        "given, this should include keys 'genome_name' and 'gtf'. Otherwise, "
        "they may be specified using the respective command line options.",
        default=None)

    parser.add_argument('-n',
                        '--genome-name',
                        help="The genome_parameter for "
                        "retrieving the pyensembl database",
                        default=None)
    parser.add_argument('-g',
                        '--gtf',
                        help="The gtf file for pyensembl",
                        default=None)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # if the config file was given, use any values in it to replace those
    # passed on the command line
    if args.config is not None:
        msg = "Reading config file"
        logger.info(msg)
        config = yaml.load(open(args.config), Loader=yaml.FullLoader)

        args.genome_name = config.get('genome_name', args.genome_name)
        args.gtf = config.get('gtf', args.gtf)

    msg = "genome_name: {}".format(args.genome_name)
    logger.debug(msg)

    msg = "gtf: {}".format(args.gtf)
    logger.debug(msg)

    msg = "Loading pyensembl database"
    logger.info(msg)

    ensembl = pyensembl.Genome(reference_name=args.genome_name,
                               annotation_name="ensembl",
                               gtf_path_or_url=args.gtf)

    # this will create the database if needed
    ensembl.index()

    msg = "Parsing motif gene names"
    logger.info(msg)

    # a line from CISBP looks like:
    #   MOTIF M002_0.6 (Ankhd1)_(Homo_sapiens)_(RBD_1.00)

    all_motifs = []

    motif_re = ("\((?P<gene_name>[^\)]+)\)_\((?P<species>[^\)]+)\)_"
                "\((?P<rbd_score>[^\)]+)\)")
    motif_re = re.compile(motif_re)

    with open(args.meme) as meme_f:
        for line in meme_f:
            if line.startswith("MOTIF"):
                (key, motif_name, info) = line.split()

                m = motif_re.match(info)

                if m is None:
                    msg = ("Could not parse gene name. Guessing the entire "
                           "string is the gene name: '{}'.".format(info))
                    logger.warning(msg)
                    gene_name = info
                else:
                    gene_name = m.group("gene_name")

                try:
                    ensembl_ids = ensembl.gene_ids_of_gene_name(gene_name)
                except ValueError:
                    msg = ("Could not find Ensembl identifier for gene_name: "
                           "'{}'".format(gene_name))
                    logger.warning(msg)
                    ensembl_ids = [gene_name]

                for ensembl_id in ensembl_ids:
                    motif = {
                        "motif_name": motif_name,
                        "gene_name": gene_name,
                        "ensembl_id": ensembl_id
                    }

                    all_motifs.append(motif)

    msg = "Joining motif gene names into large data frame"
    logger.info(msg)
    all_motifs_df = pd.DataFrame(all_motifs)

    msg = "Writing motifs to disk"
    logger.info(msg)
    utils.write_df(all_motifs_df, args.out, index=False)
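As a quick check of motif_re against the CISBP-style line shown in the comment above:

import re

motif_re = re.compile(r"\((?P<gene_name>[^\)]+)\)_\((?P<species>[^\)]+)\)_"
                      r"\((?P<rbd_score>[^\)]+)\)")

info = "(Ankhd1)_(Homo_sapiens)_(RBD_1.00)"
m = motif_re.match(info)
print(m.group("gene_name"), m.group("species"), m.group("rbd_score"))
# Ankhd1 Homo_sapiens RBD_1.00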