def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script collects counts of riboseq reads filtered at each step in "
            "the micropeptide prediction pipeline. It mostly parses fastqc results (using the "
            "crimson python package).")

    parser.add_argument('config', help="The yaml config file")
    parser.add_argument('out', help="The output csv file with the counts")

    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
        type=int, default=default_num_cpus)
    parser.add_argument('--overwrite', action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    programs = ['samtools']
    shell_utils.check_programs_exist(programs)

    config = yaml.load(open(args.config))

    res = parallel.apply_parallel_iter(
        config['riboseq_samples'].items(),
        args.num_cpus,
        get_counts,
        config,
        args)

    res = [r for r in res if r is not None]
    res_df = pd.DataFrame(res)

    utils.write_df(res_df, args.out, index=False)
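# A minimal sketch (not the real implementation) of the per-sample worker that
# parallel.apply_parallel_iter calls above. It receives one (sample_name, raw_fastq)
# item from config['riboseq_samples'].items() plus the extra config/args arguments,
# and returns one row of the final counts data frame (or None, in which case the
# caller skips the sample). The field names in the returned dict are illustrative
# assumptions; the actual get_counts parses fastqc/samtools output via crimson.
def get_counts_sketch(riboseq_sample, config, args):
    sample_name, raw_data_file = riboseq_sample

    counts = {
        'note': sample_name,
        'raw_data': raw_data_file,
        'raw_data_count': 0,  # e.g., parsed from the fastqc report with crimson
    }
    return counts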
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="This script extracts all of the UTR information from a " "transcripts file created by gffread. It writes it into a csv file, so " "it can easily be imported and manipulated with pandas.") parser.add_argument('transcripts', help="A gffread-like fasta file") parser.add_argument('out', help="The (csv.gz) output file") utils.add_logging_options(parser) args = parser.parse_args() utils.update_logging(args) utr_info = gffread_utils.get_all_utrs(args.transcripts) utils.write_df(utr_info, args.out, index=False)
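# Illustrative only: the description above promises the output is easy to work with
# in pandas. Assuming gffread_utils.get_all_utrs returns one row per transcript with
# UTR lengths (the output path and the column name below are assumptions, not the
# confirmed schema), downstream use might look like this.
#
#   import pandas as pd
#   utr_info = pd.read_csv("utr-info.csv.gz")                          # hypothetical output
#   long_five_prime = utr_info[utr_info["five_prime_utr_len"] > 100]   # assumed column name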
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script extracts the differential micropeptides from two "
            "conditions. Please see the documentation in redmine for more details.\n\n"
            "Please see the pyensembl (https://github.com/hammerlab/pyensembl) "
            "documentation for more information about the ensembl release and species.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name_a', help="The name of the first condition")
    parser.add_argument('name_b', help="The name of the second condition")
    parser.add_argument('out', help="The output (.csv.gz or .xlsx) file")

    parser.add_argument('-a', '--append-sheet', help="If this flag is given, "
        "then a worksheet with the name '<name_a>,<name_b>' will be appended "
        "to the .xlsx file given by out (if it exists)", action='store_true')

    parser.add_argument('-f', '--filter', help="If this flag is present, then "
        "the output will be filtered to include only the differential "
        "micropeptides with the highest KL-divergence and read coverage",
        action='store_true')

    parser.add_argument('--read-filter-percent', help="If the --filter flag "
        "is given, then only the top --read-filter-percent micropeptides will "
        "be considered for the final output. They still must meet the KL-"
        "divergence filtering criteria.", type=float,
        default=default_read_filter_percent)

    parser.add_argument('--kl-filter-percent', help="If the --filter flag "
        "is given, then only the top --kl-filter-percent micropeptides will "
        "be considered for the final output. They still must meet the read "
        "coverage filtering criteria.", type=float,
        default=default_kl_filter_percent)

    parser.add_argument('--id-matches', help="This is a list of files which "
        "contain ORF identifiers to compare to the differential micropeptides. "
        "For each of the files given, two columns will be added to the output "
        "which indicate if either A or B appear in the respective file. Each "
        "file should have a single ORF identifier on each line and contain "
        "nothing else.", nargs='*', default=default_id_matches)

    parser.add_argument('--id-match-names', help="A name to include in the "
        "output file for each --id-matches file. The number of names must "
        "match the number of files.", nargs='*', default=default_id_match_names)

    parser.add_argument('--overlaps', help="This is a list of bed12+ files "
        "which will be compared to the differential micropeptides. Two columns "
        "(one for A, one for B) will be added to the output which indicate if "
        "the respective micropeptides overlap a feature in each file by at "
        "least 1 bp.", nargs='*', default=default_overlaps)

    parser.add_argument('--overlap-names', help="A name to include in the "
        "output file for each --overlaps file. The number of names must match "
        "the number of files.", nargs='*', default=default_overlap_names)

    parser.add_argument('-r', '--ensembl-release', help="The version of Ensembl "
        "to use when mapping transcript identifiers to gene identifiers",
        type=int, default=default_ensembl_release)

    parser.add_argument('-s', '--ensembl-species', help="The Ensembl species "
        "to use when mapping transcript identifiers to gene identifiers",
        default=default_ensembl_species)

    parser.add_argument('--a-is-single-sample', help="By default, this script "
        "assumes the predictions come from merged replicates. If name_a is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.", action='store_true')

    parser.add_argument('--b-is-single-sample', help="By default, this script "
        "assumes the predictions come from merged replicates. If name_b is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.", action='store_true')

    parser.add_argument('--fields-to-keep', help="The fields to keep from the "
        "Bayes factor file for each condition", nargs='*',
        default=default_fields_to_keep)

    parser.add_argument('--max-micropeptide-len', help="The maximum (inclusive) "
        "length of ORFs considered as micropeptides", type=int,
        default=default_max_micropeptide_len)

    parser.add_argument('--do-not-fix-tcons', help="By default, the \"TCONS_\" "
        "identifiers from StringTie, etc., do not parse correctly; this script "
        "updates the identifiers so that they will parse correctly unless "
        "instructed not to. The script is likely to crash if the identifiers "
        "are not fixed.", action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ensembl database"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release,
        species=args.ensembl_species)
    # accessing the db property ensures the annotation database is available
    ensembl.db

    msg = "Checking the id-match and overlaps files"
    logger.info(msg)

    if len(args.id_matches) != len(args.id_match_names):
        msg = ("The number of --id-matches files and --id-match-names do not "
            "match. {} files and {} names".format(len(args.id_matches),
            len(args.id_match_names)))
        raise ValueError(msg)

    if len(args.overlaps) != len(args.overlap_names):
        msg = ("The number of --overlaps files and --overlap-names do not "
            "match. {} files and {} names".format(len(args.overlaps),
            len(args.overlap_names)))
        raise ValueError(msg)

    utils.check_files_exist(args.id_matches)
    utils.check_files_exist(args.overlaps)

    if args.filter:
        msg = "Validating filter percentages"
        logger.info(msg)

        math_utils.check_range(args.read_filter_percent, 0, 1,
            variable_name="--read-filter-percent")
        math_utils.check_range(args.kl_filter_percent, 0, 1,
            variable_name="--kl-filter-percent")

    msg = "Extracting file names"
    logger.info(msg)

    config = yaml.load(open(args.config))
    note_str = config.get('note', None)

    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    # and the smoothing parameters
    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations', None)

    lengths_a = None
    offsets_a = None

    if args.a_is_single_sample:
        lengths_a, offsets_a = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name_a, is_unique=is_unique)

    bayes_factors_a = filenames.get_riboseq_bayes_factors(config['riboseq_data'],
        args.name_a, length=lengths_a, offset=offsets_a, is_unique=is_unique,
        note=note_str, fraction=fraction,
        reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors_a):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".format(
            args.name_a, bayes_factors_a))
        raise FileNotFoundError(msg)

    predicted_orfs_a = filenames.get_riboseq_predicted_orfs(config['riboseq_data'],
        args.name_a, length=lengths_a, offset=offsets_a, is_unique=is_unique,
        note=note_str, fraction=fraction,
        reweighting_iterations=reweighting_iterations,
        is_filtered=True, is_chisq=False)

    if not os.path.exists(predicted_orfs_a):
        msg = ("Could not find the predictions bed file for {}. ({}). Quitting.".format(
            args.name_a, predicted_orfs_a))
        raise FileNotFoundError(msg)

    lengths_b = None
    offsets_b = None

    if args.b_is_single_sample:
        lengths_b, offsets_b = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name_b, is_unique=is_unique)

    bayes_factors_b = filenames.get_riboseq_bayes_factors(config['riboseq_data'],
        args.name_b, length=lengths_b, offset=offsets_b, is_unique=is_unique,
        note=note_str, fraction=fraction,
        reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors_b):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".format(
            args.name_b, bayes_factors_b))
        raise FileNotFoundError(msg)

    predicted_orfs_b = filenames.get_riboseq_predicted_orfs(config['riboseq_data'],
        args.name_b, length=lengths_b, offset=offsets_b, is_unique=is_unique,
        note=note_str, fraction=fraction,
        reweighting_iterations=reweighting_iterations,
        is_filtered=True, is_chisq=False)

    if not os.path.exists(predicted_orfs_b):
        msg = ("Could not find the predictions bed file for {}. ({}). Quitting.".format(
            args.name_b, predicted_orfs_b))
        raise FileNotFoundError(msg)

    exons_file = filenames.get_exons(config['genome_base_path'],
        config['genome_name'], note=config.get('orf_note'))

    if not os.path.exists(exons_file):
        msg = "Could not find the exons file ({}). Quitting.".format(exons_file)
        raise FileNotFoundError(msg)

    msg = "Reading the exons"
    logger.info(msg)

    exons = bed_utils.read_bed(exons_file)

    msg = "Reading the BF files"
    logger.info(msg)

    bf_df_a = bed_utils.read_bed(bayes_factors_a)
    bf_df_b = bed_utils.read_bed(bayes_factors_b)

    msg = "Reading the predictions files"
    logger.info(msg)

    bed_df_a = bed_utils.read_bed(predicted_orfs_a)
    bed_df_b = bed_utils.read_bed(predicted_orfs_b)

    differential_micropeptide_dfs = []

    # extract micropeptides
    msg = "Extracting micropeptides"
    logger.info(msg)

    m_micropeptides_a = bed_df_a['orf_len'] <= args.max_micropeptide_len
    m_micropeptides_b = bed_df_b['orf_len'] <= args.max_micropeptide_len

    micropeptides_a = bed_df_a[m_micropeptides_a]
    micropeptides_b = bed_df_b[m_micropeptides_b]

    long_orfs_a = bed_df_a[~m_micropeptides_a]
    long_orfs_b = bed_df_b[~m_micropeptides_b]

    msg = "Finding micropeptides in A with no overlap in B"
    logger.info(msg)

    micropeptides_a_no_match_b = bed_utils.subtract_bed(micropeptides_a,
        bed_df_b, exons=exons)

    micropeptides_a_no_match_b_df = pd.DataFrame()
    micropeptides_a_no_match_b_df['A'] = list(micropeptides_a_no_match_b)
    micropeptides_a_no_match_b_df['B'] = None
    micropeptides_a_no_match_b_df['kl'] = np.inf
    micropeptides_a_no_match_b_df['overlap_type'] = 'micro_a_only'

    differential_micropeptide_dfs.append(micropeptides_a_no_match_b_df)

    msg = "Finding micropeptides in B with no overlap in A"
    logger.info(msg)

    micropeptides_b_no_match_a = bed_utils.subtract_bed(micropeptides_b,
        bed_df_a, exons=exons)

    micropeptides_b_no_match_a_df = pd.DataFrame()
    micropeptides_b_no_match_a_df['B'] = list(micropeptides_b_no_match_a)
    micropeptides_b_no_match_a_df['A'] = None
    micropeptides_b_no_match_a_df['kl'] = np.inf
    micropeptides_b_no_match_a_df['overlap_type'] = 'micro_b_only'

    differential_micropeptide_dfs.append(micropeptides_b_no_match_a_df)

    msg = "Finding overlapping micropeptides"
    logger.info(msg)

    micropeptides_a_micropeptides_b_df = get_overlap_df(micropeptides_a,
        micropeptides_b, 'micro_a_micro_b', bf_df_a, bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_micropeptides_b_df)

    micropeptides_a_long_b_df = get_overlap_df(micropeptides_a, long_orfs_b,
        'micro_a_long_b', bf_df_a, bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_long_b_df)

    micropeptides_b_long_a_df = get_overlap_df(long_orfs_a, micropeptides_b,
        'long_a_micro_b', bf_df_a, bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_b_long_a_df)

    differential_micropeptides_df = pd.concat(differential_micropeptide_dfs)

    msg = "Adding read count information"
    logger.info(msg)

    res = differential_micropeptides_df.merge(bf_df_a[args.fields_to_keep],
        left_on='A', right_on='id', how='left')
    to_rename = {f: "{}_A".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_A', axis=1)

    res = res.merge(bf_df_b[args.fields_to_keep], left_on='B', right_on='id',
        how='left')
    to_rename = {f: "{}_B".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_B', axis=1)

    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    if not args.do_not_fix_tcons:
        # replace TCONS_ with TCONS
        res['A'] = res['A'].str.replace("TCONS_", "TCONS")
        res['B'] = res['B'].str.replace("TCONS_", "TCONS")

    msg = "Extracting the genes and their biotypes using pyensembl"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release,
        species=args.ensembl_species)
    ensembl_transcript_ids = set(ensembl.transcript_ids())

    biotypes_a = parallel.apply_df_simple(res, get_transcript_and_biotype, 'A',
        ensembl, ensembl_transcript_ids)
    biotypes_b = parallel.apply_df_simple(res, get_transcript_and_biotype, 'B',
        ensembl, ensembl_transcript_ids)

    biotypes_a = utils.remove_nones(biotypes_a)
    biotypes_b = utils.remove_nones(biotypes_b)

    biotypes_a = pd.DataFrame(biotypes_a)
    biotypes_b = pd.DataFrame(biotypes_b)

    res = res.merge(biotypes_a, on='A', how='left')
    res = res.merge(biotypes_b, on='B', how='left')

    msg = "Pulling annotations from mygene.info"
    logger.info(msg)

    # pull annotations from mygene
    gene_info_a = mygene_utils.query_mygene(res['gene_id_A'])
    gene_info_b = mygene_utils.query_mygene(res['gene_id_B'])

    # and add the mygene info
    res = res.merge(gene_info_a, left_on='gene_id_A', right_on='gene_id',
        how='left')

    to_rename = {f: "{}_A".format(f) for f in gene_info_a.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)

    res = res.merge(gene_info_b, left_on='gene_id_B', right_on='gene_id',
        how='left')

    # use the B annotation columns here (the original used gene_info_a.columns)
    to_rename = {f: "{}_B".format(f) for f in gene_info_b.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)

    msg = "Removing duplicates"
    logger.info(msg)

    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    msg = "Adding --id-matches columns"
    logger.info(msg)

    for (id_match_file, name) in zip(args.id_matches, args.id_match_names):
        res = add_id_matches(res, id_match_file, name)

    msg = "Adding --overlaps columns"
    logger.info(msg)

    for (overlap_file, name) in zip(args.overlaps, args.overlap_names):
        res = add_overlaps(res, overlap_file, name, bed_df_a, bed_df_b, exons)

    msg = "Sorting by in-frame reads"
    logger.info(msg)

    res['x_1_sum_A'] = res['x_1_sum_A'].fillna(0)
    res['x_1_sum_B'] = res['x_1_sum_B'].fillna(0)
    res['x_1_sum'] = res['x_1_sum_A'] + res['x_1_sum_B']
    res = res.sort_values('x_1_sum', ascending=False)

    if args.filter:
        msg = "Filtering the micropeptides by read coverage and KL-divergence"
        logger.info(msg)

        x_1_sum_ranks = res['x_1_sum'].rank(method='min', na_option='top',
            ascending=False)
        num_x_1_sum_ranks = x_1_sum_ranks.max()
        max_good_x_1_sum_rank = num_x_1_sum_ranks * args.read_filter_percent
        m_good_x_1_sum_rank = x_1_sum_ranks <= max_good_x_1_sum_rank

        msg = "Number of micropeptides passing read filter: {}".format(
            sum(m_good_x_1_sum_rank))
        logger.debug(msg)

        kl_ranks = res['kl'].rank(method='dense', na_option='top',
            ascending=False)
        num_kl_ranks = kl_ranks.max()
        max_good_kl_rank = num_kl_ranks * args.kl_filter_percent
        m_good_kl_rank = kl_ranks <= max_good_kl_rank

        msg = "Number of micropeptides passing KL filter: {}".format(
            sum(m_good_kl_rank))
        logger.debug(msg)

        m_both_filters = m_good_x_1_sum_rank & m_good_kl_rank

        msg = "Number of micropeptides passing both filters: {}".format(
            sum(m_both_filters))
        logger.debug(msg)

        res = res[m_both_filters]

    msg = "Writing differential micropeptides to disk"
    logger.info(msg)

    # --append-sheet is a store_true flag, so it is False (never None) when absent;
    # the original checked `is None`, which always appended to the xlsx file
    if not args.append_sheet:
        utils.write_df(res, args.out, index=False)
    else:
        sheet_name = "{},{}".format(args.name_a, args.name_b)
        utils.append_to_xlsx(res, args.out, sheet=sheet_name, index=False)
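# A minimal sketch of the get_overlap_df helper used above, with assumptions labeled.
# Only the output columns (A, B, kl, overlap_type) are inferred from how the results
# are concatenated; the pairing below (all-against-all by 'id') and the placeholder
# KL value are illustrative. The real helper presumably keeps only pairs whose genomic
# intervals overlap and computes the KL-divergence of their profiles from the BF files.
def get_overlap_df_sketch(orfs_a, orfs_b, overlap_type, bf_df_a, bf_df_b):
    rows = []
    for id_a in orfs_a['id']:
        for id_b in orfs_b['id']:
            rows.append({
                'A': id_a,
                'B': id_b,
                'kl': 0.0,  # placeholder; the real helper compares the two profiles
                'overlap_type': overlap_type,
            })
    return pd.DataFrame(rows)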
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script removes all of the peptides which match to multiple "
            "ORFs from the results found with get-all-orf-peptide-matches.")

    parser.add_argument('peptide_matches', help="The peptide matches file produced "
        "by get-all-orf-peptide-matches")
    parser.add_argument('out', help="A similar peptide matches file which "
        "contains only peptides which match to a unique ORF")
    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use",
        type=int, default=default_num_cpus)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading peptide matches"
    logger.info(msg)

    peptide_matches = pd.read_csv(args.peptide_matches)

    msg = "Splitting the grouped matches into individual peptide matches"
    logger.info(msg)

    matches = parallel.apply_parallel(peptide_matches, args.num_cpus,
        parse_matches, progress_bar=True)

    msg = "Removing peptides which match to multiple ORFs"
    logger.info(msg)

    matches = utils.remove_nones(matches)
    matches = utils.flatten_lists(matches)
    matches_df = pd.DataFrame(matches)
    unique_matches_df = matches_df.drop_duplicates(subset='peptide', keep=False)

    msg = "Merging the ORF-peptide matches back to single records"
    logger.info(msg)

    unique_groups = unique_matches_df.groupby('orf_id')
    merged_unique_groups = parallel.apply_parallel_groups(unique_groups,
        args.num_cpus, merge_group, progress_bar=True)
    merged_unique_df = pd.DataFrame(merged_unique_groups)

    msg = "Re-adding the ORFs which no longer have peptide matches"
    logger.info(msg)

    m_still_has_match = peptide_matches['orf_id'].isin(merged_unique_df['orf_id'])
    peptide_matches.loc[~m_still_has_match, 'num_matches'] = 0
    peptide_matches.loc[~m_still_has_match, 'peptide_matches'] = 0

    peps = [merged_unique_df, peptide_matches[~m_still_has_match]]
    merged_unique_df = pd.concat(peps)

    msg = "Writing the ORFs with unique matches to disk"
    logger.info(msg)

    utils.write_df(merged_unique_df, args.out, index=False)
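# Hedged sketches of the two helpers used above; the assumed record layout (an
# 'orf_id', a 'num_matches' count, and a semicolon-delimited 'peptide_matches'
# string) is inferred from how the columns are used, not a confirmed file format.
def parse_matches_sketch(row):
    # explode one grouped record into one dict per (orf_id, peptide) pair
    if row['num_matches'] == 0:
        return None
    peptides = str(row['peptide_matches']).split(';')
    return [{'orf_id': row['orf_id'], 'peptide': p} for p in peptides]

def merge_group_sketch(group):
    # collapse the remaining unique matches of one ORF back into a single record
    return {
        'orf_id': group['orf_id'].iloc[0],
        'num_matches': len(group),
        'peptide_matches': ';'.join(group['peptide']),
    }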
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script uses the peptides.txt file from MaxQuant to determine "
            "which predicted ORFs have some proteomics evidence.\n\nIt contains "
            "some hard-coded field names.")

    parser.add_argument('predicted_proteins', help="The (fasta, protein) file of "
        "predicted ORFs")
    parser.add_argument('peptides', help="The peptides.txt file produced by MaxQuant")
    parser.add_argument('out', help="The output (csv.gz) file containing the predicted "
        "ORFs and their coverage")

    parser.add_argument('--num-cpus', help="The number of CPUs to use for searching",
        type=int, default=default_num_cpus)

    parser.add_argument('--peptide-filter-field', help="The field to use for filtering "
        "the peptides from MaxQuant", default=default_peptide_filter_field)
    parser.add_argument('--peptide-filter-value', help="All peptides with a value greater "
        "than the filter value will be removed", type=float,
        default=default_peptide_filter_value)
    parser.add_argument('--peptide-separator', help="The separator in the --peptide file",
        default=default_peptide_separator)

    parser.add_argument('-g', '--num-groups', help="The number of groups into which to split "
        "the ORFs. More groups means the progress bar is updated more frequently but incurs "
        "more overhead because of the parallel calls.", type=int,
        default=default_num_groups)

    parser.add_argument('--num-peptides', help="If n>0, then only the first n peptide "
        "sequences will be used to calculate coverage. This is for testing.",
        type=int, default=default_num_peptides)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[get-orf-peptide-matches]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    msg = "Reading and filtering peptides"
    logger.info(msg)

    peptides = pd.read_csv(args.peptides, sep=args.peptide_separator)
    mask_filter = peptides[args.peptide_filter_field] < args.peptide_filter_value
    peptides = peptides[mask_filter]
    peptide_sequences = pd.DataFrame(peptides['Sequence'])

    if args.num_peptides > 0:
        peptide_sequences = peptide_sequences.head(args.num_peptides)

    msg = "Number of filtered peptides: {}".format(len(peptide_sequences))
    logger.info(msg)

    msg = "Reading predicted ORFs into a data frame"
    logger.info(msg)

    # TODO: use read iterator
    predicted_orfs = bio.get_read_iterator(args.predicted_proteins)

    orf_ids = []
    orf_sequences = []

    for orf_id, seq in predicted_orfs:
        orf_ids.append(orf_id)
        orf_sequences.append(seq)

    predicted_orfs_df = pd.DataFrame()
    predicted_orfs_df['orf_id'] = orf_ids
    predicted_orfs_df['orf_sequence'] = orf_sequences

    msg = "Searching for matching peptides"
    logger.info(msg)

    peptide_matches = parallel.apply_parallel_split(peptide_sequences,
        args.num_cpus, find_matching_orfs_group, predicted_orfs_df,
        progress_bar=True, num_groups=args.num_groups)

    # filter out the Nones to avoid DataFrame conversion problems
    msg = "Joining results back into large data frame"
    logger.info(msg)

    peptide_matches = [pm for pm in peptide_matches if pm is not None]
    peptide_matches = pd.concat(peptide_matches)

    # now, we have a data frame of matches (fields: peptide, orf_id)
    msg = "Getting peptide coverage of ORFs"
    logger.info(msg)

    # first, count the matches for each ORF
    peptide_matches_groups = peptide_matches.groupby('orf_id')
    orf_matches = parallel.apply_parallel_groups(peptide_matches_groups,
        args.num_cpus, count_matches, progress_bar=True)
    orf_matches = pd.DataFrame(orf_matches)

    # then, join back on the original list of ORFs to have entries for ORFs
    # with no peptide matches
    predicted_orf_coverage = pd.merge(predicted_orfs_df, orf_matches,
        on='orf_id', how="left")

    # and patch the holes in the data frame
    predicted_orf_coverage = predicted_orf_coverage.fillna(0)

    msg = "Writing coverage information to disk"
    logger.info(msg)

    utils.write_df(predicted_orf_coverage, args.out, index=False)
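# Hedged sketches of the two workers used above. find_matching_orfs_group receives one
# chunk of the peptide_sequences frame plus predicted_orfs_df; count_matches receives
# all matches for one ORF. The simple substring search and the 'num_matches' field are
# assumptions about what the real helpers do, based only on how the results are used.
def find_matching_orfs_group_sketch(peptide_group, predicted_orfs_df):
    rows = []
    for peptide in peptide_group['Sequence']:
        # substring search of each peptide against every predicted ORF protein sequence
        m_contains = predicted_orfs_df['orf_sequence'].str.contains(peptide, regex=False)
        for orf_id in predicted_orfs_df.loc[m_contains, 'orf_id']:
            rows.append({'peptide': peptide, 'orf_id': orf_id})

    if len(rows) == 0:
        return None  # the caller discards None results before concatenating
    return pd.DataFrame(rows)

def count_matches_sketch(orf_group):
    return {
        'orf_id': orf_group['orf_id'].iloc[0],
        'num_matches': len(orf_group),  # number of peptides hitting this ORF
    }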
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script uses the mygene.info service to find annotations "
            "for the transcripts associated with the ORFs in the given bed file. In "
            "particular, it extracts information from Swiss-Prot, TrEMBL, Interpro, "
            "PDB, Pfam, PROSITE, the Gene Ontology, and KEGG.")

    parser.add_argument('bed', help="The bed file")
    parser.add_argument('out', help="The output file. Its type will be inferred "
        "from its extension.")

    parser.add_argument('--do-not-trim', help="By default, the script will "
        "attempt to trim transcript identifiers such that they are valid Ensembl "
        "identifiers. If this flag is given, no trimming will take place.",
        action='store_true')

    parser.add_argument('--scopes', help="A list of scopes to use when querying "
        "mygene.info. Please see the documentation for more information about "
        "valid scopes: http://mygene.info/doc/query_service.html#available_fields",
        nargs='*', default=default_scopes)

    parser.add_argument('--do-not-convert-ids', help="By default, the script will "
        "treat the identifiers in the file as transcript identifiers. It first "
        "maps those to gene identifiers, and then it uses those to find the "
        "gene annotations. If the identifiers are already gene ids (or whatever "
        "is specified by scopes), then the first mapping is not necessary and "
        "can be skipped using this flag.", action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    convert_ids = not args.do_not_convert_ids

    msg = "Reading the bed file"
    logger.info(msg)

    bed = bed_utils.read_bed(args.bed)
    bed = bed[fields_to_keep]

    msg = "Extracting transcript ids"
    logger.info(msg)

    trim = not args.do_not_trim
    orf_ids = parallel.apply_iter_simple(bed['id'], parse_orf_id, trim)
    orf_ids_df = pd.DataFrame(orf_ids)

    if convert_ids:
        msg = "Querying transcript to gene id mapping"
        logger.info(msg)

        gene_ids = mygene_utils.get_transcript_to_gene_mapping(
            orf_ids_df['transcript_id'])
    else:
        gene_ids = pd.DataFrame()
        gene_ids['transcript_id'] = orf_ids_df['transcript_id']
        gene_ids['gene_id'] = orf_ids_df['transcript_id']

    msg = "Querying gene annotations"
    logger.info(msg)

    res_df = mygene_utils.query_mygene(gene_ids['gene_id'])

    msg = "Combining gene annotations with transcript ids"
    logger.info(msg)

    res_df = gene_ids.merge(res_df, on='gene_id', how='inner')

    msg = "Combining transcript annotations with ORF ids"
    logger.info(msg)

    orf_ids_fields = ['transcript_id', 'orf_id']
    res_df = orf_ids_df[orf_ids_fields].merge(res_df, on='transcript_id',
        how='inner')

    msg = "Combining ORF annotations with ORF predictions"
    logger.info(msg)

    res_df = bed.merge(res_df, left_on='id', right_on='orf_id', how='left')

    msg = "Writing ORF annotations to disk"
    logger.info(msg)

    utils.write_df(res_df, args.out, index=False)
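# A hedged sketch of the parse_orf_id helper used above. The assumed identifier layout
# ("<transcript_id>_<rest of ORF id>") is illustrative; only the returned keys
# ('orf_id', 'transcript_id') are taken from how the result is used above.
def parse_orf_id_sketch(orf_id, trim):
    transcript_id = orf_id.split('_')[0]
    if trim:
        # drop an Ensembl-style version suffix, e.g. ENST00000335137.4 -> ENST00000335137
        transcript_id = transcript_id.split('.')[0]
    return {'orf_id': orf_id, 'transcript_id': transcript_id}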
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Test models learned with train-as-auto-sklearn. It "
            "writes the predictions to disk as a \"long\" data frame. The output "
            "file is in gzipped csv format.")

    parser.add_argument('scenario', help="The ASlib scenario")
    parser.add_argument('model_template', help="A template string for the filenames for "
        "the learned models. ${solver} and ${fold} are the template part of "
        "the string. It is probably necessary to surround this argument with "
        "single quotes in order to prevent shell replacement of the template "
        "parts.")
    parser.add_argument('out', help="The output csv file")
    parser.add_argument('--config', help="A (yaml) config file which "
        "specifies options controlling the learner behavior")

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ASlib scenario"
    logger.info(msg)

    scenario = ASlibScenario()
    scenario.read_scenario(args.scenario)

    if args.config is not None:
        msg = "Loading yaml config file"
        logger.info(msg)
        config = yaml.load(open(args.config))
    else:
        config = {}

    msg = "Creating string templates"
    logger.info(msg)

    model_template = string.Template(args.model_template)

    msg = "Finding folds from ASlib scenario"
    logger.info(msg)

    folds = [int(i) for i in scenario.cv_data['fold'].unique()]
    folds = sorted(folds)

    msg = "Making predictions"
    logger.info(msg)

    all_predictions = []
    it = itertools.product(scenario.algorithms, folds)

    for solver, fold in it:
        model_file = model_template.substitute(solver=solver, fold=fold)

        if not os.path.exists(model_file):
            msg = "Could not find model file. Skipping: {}".format(model_file)
            logger.warning(msg)
            continue

        try:
            model = joblib.load(model_file)
        except Exception:
            msg = "Problem loading the model file. Skipping: {}".format(model_file)
            logger.warning(msg)
            continue

        msg = "Processing. solver: {}. fold: {}".format(solver, fold)
        logger.info(msg)

        testing, training = scenario.get_split(fold)
        y_pred = model.predict(testing.feature_data)

        # the original condition was `if 'log_performance_data':`, which is always
        # true; presumably it should consult the config option of the same name
        if config.get('log_performance_data', False):
            # exp transform it back out
            y_pred = np.expm1(y_pred)

        pred_df = pd.DataFrame()
        pred_df['instance_id'] = testing.feature_data.index
        pred_df['solver'] = solver
        pred_df['fold'] = fold
        pred_df['actual'] = testing.performance_data[solver].values
        pred_df['predicted'] = y_pred

        all_predictions.append(pred_df)

    msg = "Joining all predictions in a long data frame"
    logger.info(msg)

    all_predictions = pd.concat(all_predictions)

    msg = "Writing predictions to disk"
    logger.info(msg)

    utils.write_df(all_predictions, args.out, index=False)
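# Illustrative only: how the ${solver}/${fold} model filename template above is expanded
# with string.Template. The path pattern below is a made-up example, not a required layout.
#
#   import string
#   template = string.Template("models/${solver}.fold-${fold}.pkl")
#   print(template.substitute(solver="lingeling", fold=3))
#   # -> models/lingeling.fold-3.pkl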
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Given a meme motif file, extract the gene names and map "
            "them to ensembl identifiers using pyensembl. The pyensembl database "
            "information can be given either in a yaml config file or as command "
            "line options. The yaml config file values have precedence over the "
            "command line options.")

    parser.add_argument('meme', help="The meme file")
    parser.add_argument('out', help="The output file")

    parser.add_argument('-c', '--config', help="The yaml config file. If "
        "given, this should include keys 'genome_name' and 'gtf'. Otherwise, "
        "they may be specified using the respective command line options.",
        default=None)

    parser.add_argument('-n', '--genome-name', help="The genome_name parameter for "
        "retrieving the pyensembl database", default=None)
    parser.add_argument('-g', '--gtf', help="The gtf file for pyensembl",
        default=None)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # if the config file was given, use any values in it to replace those
    # passed on the command line
    if args.config is not None:
        msg = "Reading config file"
        logger.info(msg)

        config = yaml.load(open(args.config))
        args.genome_name = config.get('genome_name', args.genome_name)
        args.gtf = config.get('gtf', args.gtf)

    msg = "genome_name: {}".format(args.genome_name)
    logger.debug(msg)

    msg = "gtf: {}".format(args.gtf)
    logger.debug(msg)

    msg = "Loading pyensembl database"
    logger.info(msg)

    ensembl = pyensembl.Genome(reference_name=args.genome_name,
        annotation_name="ensembl", gtf_path_or_url=args.gtf)

    # this will create the database if needed
    ensembl.index()

    msg = "Parsing motif gene names"
    logger.info(msg)

    # a line from CISBP looks like:
    # MOTIF M002_0.6 (Ankhd1)_(Homo_sapiens)_(RBD_1.00)
    all_motifs = []

    motif_re = (r"\((?P<gene_name>[^\)]+)\)_\((?P<species>[^\)]+)\)_"
        r"\((?P<rbd_score>[^\)]+)\)")
    motif_re = re.compile(motif_re)

    with open(args.meme) as meme_f:
        for line in meme_f:
            if line.startswith("MOTIF"):
                (key, motif_name, info) = line.split()

                m = motif_re.match(info)

                if m is None:
                    msg = ("Could not parse gene name. Guessing the entire "
                        "string is the gene name: '{}'.".format(info))
                    logger.warning(msg)
                    gene_name = info
                else:
                    gene_name = m.group("gene_name")

                try:
                    ensembl_ids = ensembl.gene_ids_of_gene_name(gene_name)
                except ValueError:
                    msg = ("Could not find Ensembl identifier for gene_name: "
                        "'{}'".format(gene_name))
                    logger.warning(msg)
                    ensembl_ids = [gene_name]

                for ensembl_id in ensembl_ids:
                    motif = {
                        "motif_name": motif_name,
                        "gene_name": gene_name,
                        "ensembl_id": ensembl_id
                    }
                    all_motifs.append(motif)

    msg = "Joining motif gene names into large data frame"
    logger.info(msg)

    all_motifs_df = pd.DataFrame(all_motifs)

    msg = "Writing motifs to disk"
    logger.info(msg)

    utils.write_df(all_motifs_df, args.out, index=False)
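# Illustrative check of the CISBP MOTIF regex above, applied to the example line from
# the comment; the group names match those used in the parser.
#
#   import re
#   motif_re = re.compile(r"\((?P<gene_name>[^\)]+)\)_\((?P<species>[^\)]+)\)_"
#       r"\((?P<rbd_score>[^\)]+)\)")
#   m = motif_re.match("(Ankhd1)_(Homo_sapiens)_(RBD_1.00)")
#   print(m.group("gene_name"), m.group("species"), m.group("rbd_score"))
#   # -> Ankhd1 Homo_sapiens RBD_1.00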