def create_solver_schedules(self, scenario):
    """ Use the fit pipeline to create solver schedules for the scenario instances """
    check_is_fitted(self, ["pipeline_"])

    # currently, just take the predicted best solver
    X_test = scenario.feature_data
    y_test = scenario.performance_data

    X_test = X_test.values
    y_pred = self.predict(X_test)

    msg = "X_test.shape: {}. y_pred.shape: {}".format(X_test.shape, y_pred.shape)
    logger.debug(msg)

    choices = self.inverse_transform(y_pred)

    it = zip(choices, scenario.instances)

    schedules = {}
    for choice, instance in it:
        if scenario.performance_type[0] == "runtime":
            solver_schedule = self.feature_steps + [
                [choice, scenario.algorithm_cutoff_time]
            ]
        elif scenario.performance_type[0] == "solution_quality":
            solver_schedule = [[choice, 999999999999]]

        schedule = utils.remove_nones(solver_schedule)
        schedules[instance] = schedule

    return schedules
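# Every example in this listing funnels intermediate results through
# utils.remove_nones. The sketch below shows the behavior these examples
# assume (drop None entries while preserving order); it is only an
# illustration, and the actual helper in the source project may differ.
def remove_nones(lst):
    """Return a new list with the None entries removed (order preserved).

    Minimal sketch of the assumed behavior of utils.remove_nones; the real
    helper may accept additional arguments.
    """
    return [item for item in lst if item is not None]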
def get_orf_positions(seq, start_codons_re, stop_codons_re):
    """ This function extracts the relative position of all ORFs from the given
    sequence. It assumes the sequence does not include any whitespace, and that
    the regular expressions properly identify start and stop codons. For example,
    if seq has already been transcribed, then the start codon should be "AUG".

    N.B. The ORFs *include* the first base in the start codon (e.g., "A" in "ATG")

    Args:
        seq (string): the (untranslated) sequence

        start_codons_re, stop_codons_re (compiled regular expressions):
            regular expressions which identify start and stop codons, respectively

    Returns:
        list of orf_positions (a named 2-tuple with "start" and "end" fields)

    Example usage:
        orfs = get_orf_positions(seq, start_codons_re, stop_codons_re)
        first_orf_start = orfs[0].start
        first_orf_end = orfs[0].end
    """
    # these give the positions of the start and stop codons in the sequence
    start_pos = np.array([m.start() for m in start_codons_re.finditer(seq)])
    stop_pos = np.array([m.start() for m in stop_codons_re.finditer(seq)])

    # pull out the matching stop position for each start
    orfs = [get_matching_stop_position(s, stop_pos) for s in start_pos]
    orfs = utils.remove_nones(orfs)

    return orfs
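# A hedged usage sketch for get_orf_positions, assuming DNA-style start and
# stop codons and the named-tuple return described in the docstring. The
# sequence and patterns here are made up for illustration; real callers may
# use different or additional codon patterns.
import re

start_codons_re = re.compile("ATG")
stop_codons_re = re.compile("TAA|TAG|TGA")

seq = "CCATGAAATTTGGGTAACC"
orfs = get_orf_positions(seq, start_codons_re, stop_codons_re)
if orfs:
    first_orf_start = orfs[0].start
    first_orf_end = orfs[0].end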
def refit(self, scenario):
    """ Update the parameters, but not hyperparameters, of the ensemble and
    other pipeline members using the new training data.

    N.B. This starts training from scratch. It does not use "warm starts".
    Also, due to some technical details, some members of the ensembles fit
    with auto-sklearn cannot be refit. Those are discarded.

    Parameters
    ----------
    scenario: an ASlibScenario
        Presumably, this is something like a (different) cv split of the
        data originally used to train the pipeline.

    Returns
    -------
    self
    """
    check_is_fitted(self, "pipeline_")

    # overwrite whatever training data we had
    self.X_train = scenario.feature_data.values
    self.y_train = scenario.performance_data

    # we can just overwrite most of the existing pipeline
    feature_selector = self.pipeline_.named_steps['feature_selector']
    nss = self.pipeline_.named_steps['nss']
    imputer = self.pipeline_.named_steps.get('imputer')
    selector = self.pipeline_.named_steps['selector']

    # none of the preprocessing has tunable hyperparameters

    # we do not always use an imputer, though
    i = None
    if imputer is not None:
        i = ('imputer', imputer)

    p = utils.remove_nones([
        ('feature_selector', feature_selector),
        ('nss', copy.deepcopy(nss)),
        i
    ])

    # fit the first part of the pipeline to transform the training data
    p = sklearn.pipeline.Pipeline(p)
    p_fit = p.fit(self.X_train, self.y_train)

    # now, transform our data so we can send it to the ensemble
    X_tr = p_fit.transform(self.X_train)
    selector_refit = selector.refit(X_tr, self.y_train)

    # finally, reconstruct our refit pipeline
    p_fit = p_fit.steps
    p_fit.append(("selector", selector_refit))
    self.pipeline_ = sklearn.pipeline.Pipeline(p_fit)

    return self
def create_schedules(self, scenario):
    """ Create the algorithm selection schedules for all instances in the scenario """
    presolver_schedules = self.presolver_scheduler_.create_presolver_schedule(scenario)
    solver_schedules = self.pipeline_.create_solver_schedules(scenario)

    schedules = {}
    for instance in scenario.instances:
        schedule = utils.remove_nones(
            presolver_schedules[instance] + solver_schedules[instance])
        schedules[instance] = schedule

    return schedules
def fit(self, scenario):
    """ Fit the pipeline using the ASlibScenario """
    if self.features is None:
        self.feature_columns_ = len(scenario.feature_data.columns)
        self.feature_columns_ = np.arange(self.feature_columns_, dtype=int)
    else:
        self.feature_columns_ = [
            scenario.feature_data.columns.get_loc(c) for c in self.features
        ]

    feature_selector = mlxtend.feature_selection.ColumnSelector(
        cols=self.feature_columns_)

    nss = NaNStandardScaler()

    as_asl_ensemble = ASaslEnsemble(
        args=self.args,
        solvers=scenario.algorithms,
        use_random_forests=self.use_random_forests)

    # if we are using random forests, then we must also impute missing values
    imputer = None
    if self.use_random_forests:
        imputer = automl_utils.get_imputer("zero_fill")
        imputer = ('imputer', imputer)

    pipeline = utils.remove_nones([
        ('feature_selector', feature_selector),
        ('nss', nss),
        imputer,
        ('selector', as_asl_ensemble)
    ])

    self.pipeline = sklearn.pipeline.Pipeline(pipeline)

    self.X_train = scenario.feature_data.values
    self.y_train = scenario.performance_data

    self.pipeline_ = self.pipeline.fit(self.X_train, self.y_train)
    return self
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script extracts the differential micropeptides from two "
        "conditions. Please see the documentation in redmine for more details.\n\n"
        "Please see the pyensembl (https://github.com/hammerlab/pyensembl) "
        "documentation for more information about the ensembl release and species.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name_a', help="The name of the first condition")
    parser.add_argument('name_b', help="The name of the second condition")
    parser.add_argument('out', help="The output (.csv.gz or .xlsx) file")

    parser.add_argument('-a', '--append-sheet', help="If this flag is given, "
        "then a worksheet with the name '<name_a>,<name_b>' will be appended "
        "to the .xlsx file given by out (if it exists)", action='store_true')

    parser.add_argument('-f', '--filter', help="If this flag is present, then "
        "the output will be filtered to include only the differential "
        "micropeptides with the highest KL-divergence and read coverage",
        action='store_true')

    parser.add_argument('--read-filter-percent', help="If the --filter flag "
        "is given, then only the top --read-filter-percent micropeptides will "
        "be considered for the final output. They still must meet the KL-"
        "divergence filtering criteria.", type=float,
        default=default_read_filter_percent)

    parser.add_argument('--kl-filter-percent', help="If the --filter flag "
        "is given, then only the top --kl-filter-percent micropeptides will "
        "be considered for the final output. They still must meet the read "
        "coverage filtering criteria.", type=float,
        default=default_kl_filter_percent)

    parser.add_argument('--id-matches', help="This is a list of files which "
        "contain ORF identifiers to compare to the differential micropeptides. "
        "For each of the files given, two columns will be added to the output "
        "which indicate if either A or B appear in the respective file. Each "
        "file should have a single ORF identifier on each line and contain "
        "nothing else.", nargs='*', default=default_id_matches)

    parser.add_argument('--id-match-names', help="A name to include in the "
        "output file for each --id-matches file. The number of names must "
        "match the number of files.", nargs='*', default=default_id_match_names)

    parser.add_argument('--overlaps', help="This is a list of bed12+ files "
        "which will be compared to the differential micropeptides. Two columns "
        "(one for A, one for B) will be added to the output which indicate if "
        "the respective micropeptides overlap a feature in each file by at "
        "least 1 bp.", nargs='*', default=default_overlaps)

    parser.add_argument('--overlap-names', help="A name to include in the "
        "output file for each --overlaps file. The number of names must match "
        "the number of files.", nargs='*', default=default_overlap_names)

    parser.add_argument('-r', '--ensembl-release', help="The version of Ensembl "
        "to use when mapping transcript identifiers to gene identifiers",
        type=int, default=default_ensembl_release)

    parser.add_argument('-s', '--ensembl-species', help="The Ensembl species "
        "to use when mapping transcript identifiers to gene identifiers",
        default=default_ensembl_species)

    parser.add_argument('--a-is-single-sample', help="By default, this script "
        "assumes the predictions come from merged replicates. If name_a is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.", action='store_true')

    parser.add_argument('--b-is-single-sample', help="By default, this script "
        "assumes the predictions come from merged replicates. If name_b is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.", action='store_true')

    parser.add_argument('--fields-to-keep', help="The fields to keep from the "
        "Bayes factor file for each condition", nargs='*',
        default=default_fields_to_keep)

    parser.add_argument('--max-micropeptide-len', help="The maximum (inclusive) "
        "length of ORFs considered as micropeptides", type=int,
        default=default_max_micropeptide_len)

    parser.add_argument('--do-not-fix-tcons', help="By default, the \"TCONS_\" "
        "identifiers from StringTie, etc., do not parse correctly; this script "
        "updates the identifiers so that they will parse correctly, unless "
        "instructed not to. The script is likely to crash if the identifiers "
        "are not fixed.", action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ensembl database"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release,
        species=args.ensembl_species)

    # touch the database to make sure it is available
    ensembl.db

    msg = "Checking the id-match and overlaps files"
    logger.info(msg)

    if len(args.id_matches) != len(args.id_match_names):
        msg = ("The number of --id-matches files and --id-match-names do not "
            "match. {} files and {} names".format(len(args.id_matches),
            len(args.id_match_names)))
        raise ValueError(msg)

    if len(args.overlaps) != len(args.overlap_names):
        msg = ("The number of --overlaps files and --overlap-names do not "
            "match. {} files and {} names".format(len(args.overlaps),
            len(args.overlap_names)))
        raise ValueError(msg)

    utils.check_files_exist(args.id_matches)
    utils.check_files_exist(args.overlaps)

    if args.filter:
        msg = "Validating filter percentages"
        logger.info(msg)

        math_utils.check_range(args.read_filter_percent, 0, 1,
            variable_name="--read-filter-percent")

        math_utils.check_range(args.kl_filter_percent, 0, 1,
            variable_name="--kl-filter-percent")

    msg = "Extracting file names"
    logger.info(msg)

    config = yaml.load(open(args.config))
    note_str = config.get('note', None)

    # keep multimappers?
    is_unique = 'keep_riboseq_multimappers' not in config

    # and the smoothing parameters
    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations', None)

    lengths_a = None
    offsets_a = None

    if args.a_is_single_sample:
        lengths_a, offsets_a = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name_a, is_unique=is_unique)

    bayes_factors_a = filenames.get_riboseq_bayes_factors(
        config['riboseq_data'], args.name_a, length=lengths_a, offset=offsets_a,
        is_unique=is_unique, note=note_str, fraction=fraction,
        reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors_a):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".format(
            args.name_a, bayes_factors_a))
        raise FileNotFoundError(msg)

    predicted_orfs_a = filenames.get_riboseq_predicted_orfs(
        config['riboseq_data'], args.name_a, length=lengths_a, offset=offsets_a,
        is_unique=is_unique, note=note_str, fraction=fraction,
        reweighting_iterations=reweighting_iterations,
        is_filtered=True, is_chisq=False)

    if not os.path.exists(predicted_orfs_a):
        msg = ("Could not find the predictions bed file for {}. ({}). Quitting.".format(
            args.name_a, predicted_orfs_a))
        raise FileNotFoundError(msg)

    lengths_b = None
    offsets_b = None

    if args.b_is_single_sample:
        lengths_b, offsets_b = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name_b, is_unique=is_unique)

    bayes_factors_b = filenames.get_riboseq_bayes_factors(
        config['riboseq_data'], args.name_b, length=lengths_b, offset=offsets_b,
        is_unique=is_unique, note=note_str, fraction=fraction,
        reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors_b):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".format(
            args.name_b, bayes_factors_b))
        raise FileNotFoundError(msg)

    predicted_orfs_b = filenames.get_riboseq_predicted_orfs(
        config['riboseq_data'], args.name_b, length=lengths_b, offset=offsets_b,
        is_unique=is_unique, note=note_str, fraction=fraction,
        reweighting_iterations=reweighting_iterations,
        is_filtered=True, is_chisq=False)

    if not os.path.exists(predicted_orfs_b):
        msg = ("Could not find the predictions bed file for {}. ({}). Quitting.".format(
            args.name_b, predicted_orfs_b))
        raise FileNotFoundError(msg)

    exons_file = filenames.get_exons(config['genome_base_path'],
        config['genome_name'], note=config.get('orf_note'))

    if not os.path.exists(exons_file):
        msg = "Could not find the exons file ({}). Quitting.".format(exons_file)
        raise FileNotFoundError(msg)

    msg = "Reading the exons"
    logger.info(msg)

    exons = bed_utils.read_bed(exons_file)

    msg = "Reading the BF files"
    logger.info(msg)

    bf_df_a = bed_utils.read_bed(bayes_factors_a)
    bf_df_b = bed_utils.read_bed(bayes_factors_b)

    msg = "Reading the predictions files"
    logger.info(msg)

    bed_df_a = bed_utils.read_bed(predicted_orfs_a)
    bed_df_b = bed_utils.read_bed(predicted_orfs_b)

    differential_micropeptide_dfs = []

    # extract micropeptides
    msg = "Extracting micropeptides"
    logger.info(msg)

    m_micropeptides_a = bed_df_a['orf_len'] <= args.max_micropeptide_len
    m_micropeptides_b = bed_df_b['orf_len'] <= args.max_micropeptide_len

    micropeptides_a = bed_df_a[m_micropeptides_a]
    micropeptides_b = bed_df_b[m_micropeptides_b]

    long_orfs_a = bed_df_a[~m_micropeptides_a]
    long_orfs_b = bed_df_b[~m_micropeptides_b]

    msg = "Finding micropeptides in A with no overlap in B"
    logger.info(msg)

    micropeptides_a_no_match_b = bed_utils.subtract_bed(micropeptides_a,
        bed_df_b, exons=exons)

    micropeptides_a_no_match_b_df = pd.DataFrame()
    micropeptides_a_no_match_b_df['A'] = list(micropeptides_a_no_match_b)
    micropeptides_a_no_match_b_df['B'] = None
    micropeptides_a_no_match_b_df['kl'] = np.inf
    micropeptides_a_no_match_b_df['overlap_type'] = 'micro_a_only'

    differential_micropeptide_dfs.append(micropeptides_a_no_match_b_df)

    msg = "Finding micropeptides in B with no overlap in A"
    logger.info(msg)

    micropeptides_b_no_match_a = bed_utils.subtract_bed(micropeptides_b,
        bed_df_a, exons=exons)

    micropeptides_b_no_match_a_df = pd.DataFrame()
    micropeptides_b_no_match_a_df['B'] = list(micropeptides_b_no_match_a)
    micropeptides_b_no_match_a_df['A'] = None
    micropeptides_b_no_match_a_df['kl'] = np.inf
    micropeptides_b_no_match_a_df['overlap_type'] = 'micro_b_only'

    differential_micropeptide_dfs.append(micropeptides_b_no_match_a_df)

    msg = "Finding overlapping micropeptides"
    logger.info(msg)

    micropeptides_a_micropeptides_b_df = get_overlap_df(micropeptides_a,
        micropeptides_b, 'micro_a_micro_b', bf_df_a, bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_micropeptides_b_df)

    micropeptides_a_long_b_df = get_overlap_df(micropeptides_a, long_orfs_b,
        'micro_a_long_b', bf_df_a, bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_long_b_df)

    micropeptides_b_long_a_df = get_overlap_df(long_orfs_a, micropeptides_b,
        'long_a_micro_b', bf_df_a, bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_b_long_a_df)

    differential_micropeptides_df = pd.concat(differential_micropeptide_dfs)

    msg = "Adding read count information"
    logger.info(msg)

    res = differential_micropeptides_df.merge(bf_df_a[args.fields_to_keep],
        left_on='A', right_on='id', how='left')
    to_rename = {f: "{}_A".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_A', axis=1)

    res = res.merge(bf_df_b[args.fields_to_keep], left_on='B', right_on='id',
        how='left')
    to_rename = {f: "{}_B".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_B', axis=1)

    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    if not args.do_not_fix_tcons:
        # replace TCONS_ with TCONS
        res['A'] = res['A'].str.replace("TCONS_", "TCONS")
        res['B'] = res['B'].str.replace("TCONS_", "TCONS")

    msg = "Extracting the genes and their biotypes using pyensembl"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release,
        species=args.ensembl_species)
    ensembl_transcript_ids = set(ensembl.transcript_ids())

    biotypes_a = parallel.apply_df_simple(res, get_transcript_and_biotype, 'A',
        ensembl, ensembl_transcript_ids)
    biotypes_b = parallel.apply_df_simple(res, get_transcript_and_biotype, 'B',
        ensembl, ensembl_transcript_ids)

    biotypes_a = utils.remove_nones(biotypes_a)
    biotypes_b = utils.remove_nones(biotypes_b)

    biotypes_a = pd.DataFrame(biotypes_a)
    biotypes_b = pd.DataFrame(biotypes_b)

    res = res.merge(biotypes_a, on='A', how='left')
    res = res.merge(biotypes_b, on='B', how='left')

    msg = "Pulling annotations from mygene.info"
    logger.info(msg)

    # pull annotations from mygene
    gene_info_a = mygene_utils.query_mygene(res['gene_id_A'])
    gene_info_b = mygene_utils.query_mygene(res['gene_id_B'])

    # and add the mygene info
    res = res.merge(gene_info_a, left_on='gene_id_A', right_on='gene_id',
        how='left')

    to_rename = {f: "{}_A".format(f) for f in gene_info_a.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)

    res = res.merge(gene_info_b, left_on='gene_id_B', right_on='gene_id',
        how='left')

    to_rename = {f: "{}_B".format(f) for f in gene_info_a.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)

    msg = "Removing duplicates"
    logger.info(msg)

    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    msg = "Adding --id-matches columns"
    logger.info(msg)

    for (id_match_file, name) in zip(args.id_matches, args.id_match_names):
        res = add_id_matches(res, id_match_file, name)

    msg = "Adding --overlaps columns"
    logger.info(msg)

    for (overlap_file, name) in zip(args.overlaps, args.overlap_names):
        res = add_overlaps(res, overlap_file, name, bed_df_a, bed_df_b, exons)

    msg = "Sorting by in-frame reads"
    logger.info(msg)

    res['x_1_sum_A'] = res['x_1_sum_A'].fillna(0)
    res['x_1_sum_B'] = res['x_1_sum_B'].fillna(0)
    res['x_1_sum'] = res['x_1_sum_A'] + res['x_1_sum_B']
    res = res.sort_values('x_1_sum', ascending=False)

    if args.filter:
        msg = "Filtering the micropeptides by read coverage and KL-divergence"
        logger.info(msg)

        x_1_sum_ranks = res['x_1_sum'].rank(method='min', na_option='top',
            ascending=False)

        num_x_1_sum_ranks = x_1_sum_ranks.max()
        max_good_x_1_sum_rank = num_x_1_sum_ranks * args.read_filter_percent
        m_good_x_1_sum_rank = x_1_sum_ranks <= max_good_x_1_sum_rank

        msg = ("Number of micropeptides passing read filter: {}".format(
            sum(m_good_x_1_sum_rank)))
        logger.debug(msg)

        kl_ranks = res['kl'].rank(method='dense', na_option='top',
            ascending=False)

        num_kl_ranks = kl_ranks.max()
        max_good_kl_rank = num_kl_ranks * args.kl_filter_percent
        m_good_kl_rank = kl_ranks <= max_good_kl_rank

        msg = ("Number of micropeptides passing KL filter: {}".format(
            sum(m_good_kl_rank)))
        logger.debug(msg)

        m_both_filters = m_good_x_1_sum_rank & m_good_kl_rank

        msg = ("Number of micropeptides passing both filters: {}".format(
            sum(m_both_filters)))
        logger.debug(msg)

        res = res[m_both_filters]

    msg = "Writing differential micropeptides to disk"
    logger.info(msg)

    # --append-sheet is a store_true flag, so it is a boolean (never None)
    if not args.append_sheet:
        utils.write_df(res, args.out, index=False)
    else:
        sheet_name = "{},{}".format(args.name_a, args.name_b)
        utils.append_to_xlsx(res, args.out, sheet=sheet_name, index=False)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script estimates the Bayes factors for all metagene "
        "profiles in the given file. The script accepts as input multiple "
        "\"periodic\" and \"nonperiodic\" models. It uses the models of each "
        "type with the best mean to estimate the Bayes factor distributions.\n\n"
        "It contains some hard-coded field names.")

    parser.add_argument('metagene_profiles', help="The (csv) file containing the "
        "metagene profiles")
    parser.add_argument('out', help="The output (csv.gz) file")

    parser.add_argument('--periodic-models', help="A list of pickled StanModel "
        "files which contain models that somehow represent periodic metagene "
        "profiles", nargs="+", default=default_periodic_models)
    parser.add_argument('--nonperiodic-models', help="A list of pickled StanModel "
        "files which contain models that somehow represent nonperiodic metagene "
        "profiles", nargs="+", default=default_nonperiodic_models)

    parser.add_argument('--periodic-offset-start', help="The position, relative "
        "to the translation initiation site, to begin calculating periodicity "
        "Bayes factors (inclusive)", type=int,
        default=default_periodic_offset_start)
    parser.add_argument('--periodic-offset-end', help="The position, relative "
        "to the translation initiation site, to stop calculating periodicity "
        "Bayes factors (inclusive)", type=int,
        default=default_periodic_offset_end)
    parser.add_argument('--metagene-profile-length', help="The length of the "
        "profile to use in the models. metagene_profile_length + "
        "periodic_offset_end must be consistent with the length of the extracted "
        "metagene profile. The length must be divisible by three.", type=int,
        default=default_metagene_profile_length)

    parser.add_argument('-s', '--seed', help="The random seed to use for "
        "inference", type=int, default=default_seed)
    parser.add_argument('-c', '--chains', help="The number of MCMC chains to use",
        type=int, default=default_chains)
    parser.add_argument('-i', '--iterations', help="The number of MCMC iterations "
        "to use for each chain", type=int, default=default_iterations)

    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use. Each "
        "read length will be processed in its own thread (so that is the maximum "
        "number of CPUs that is useful).", type=int, default=default_num_cpus)

    parser.add_argument('--type-field', default=default_type_field)
    parser.add_argument('--count-field', default=default_count_field)
    parser.add_argument('--position-field', default=default_position_field)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # we will parallelize based on the lengths, so we need to know which lengths
    # are present in the metagene profiles file
    metagene_profiles = pd.read_csv(args.metagene_profiles)
    lengths = list(metagene_profiles['length'].unique())

    length_str = ','.join(str(int(l)) for l in lengths)
    msg = "Estimating Bayes factors for lengths: {}".format(length_str)
    logger.info(msg)

    length_groups = metagene_profiles.groupby('length')

    with suppress_stdout_stderr():
        all_profile_estimates_df = parallel.apply_parallel_groups(
            length_groups, args.num_cpus, estimate_profile_bayes_factors, args,
            progress_bar=True)

    msg = "Combining estimates into one data frame"
    logger.info(msg)

    all_profile_estimates_df = utils.remove_nones(all_profile_estimates_df)
    all_profile_estimates_df = pd.concat(all_profile_estimates_df)

    pandas_utils.write_df(all_profile_estimates_df, args.out, index=False)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script removes all of the peptides which match to "
        "multiple ORFs from the results found with get-all-orf-peptide-matches.")

    parser.add_argument('peptide_matches', help="The peptide matches file "
        "produced by get-all-orf-peptide-matches")
    parser.add_argument('out', help="A similar peptide matches file which "
        "contains only peptides which match to a unique ORF")
    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use",
        type=int, default=default_num_cpus)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading peptide matches"
    logger.info(msg)

    peptide_matches = pd.read_csv(args.peptide_matches)

    msg = "Splitting the grouped matches into individual peptide matches"
    logger.info(msg)

    matches = parallel.apply_parallel(peptide_matches, args.num_cpus,
        parse_matches, progress_bar=True)

    msg = "Removing peptides which match to multiple ORFs"
    logger.info(msg)

    matches = utils.remove_nones(matches)
    matches = utils.flatten_lists(matches)
    matches_df = pd.DataFrame(matches)
    unique_matches_df = matches_df.drop_duplicates(subset='peptide', keep=False)

    msg = "Merging the ORF-peptide matches back to single records"
    logger.info(msg)

    unique_groups = unique_matches_df.groupby('orf_id')
    merged_unique_groups = parallel.apply_parallel_groups(
        unique_groups, args.num_cpus, merge_group, progress_bar=True)
    merged_unique_df = pd.DataFrame(merged_unique_groups)

    msg = "Re-adding the ORFs which no longer have peptide matches"
    logger.info(msg)

    m_still_has_match = peptide_matches['orf_id'].isin(merged_unique_df['orf_id'])
    peptide_matches.loc[~m_still_has_match, 'num_matches'] = 0
    peptide_matches.loc[~m_still_has_match, 'peptide_matches'] = 0

    peps = [merged_unique_df, peptide_matches[~m_still_has_match]]
    merged_unique_df = pd.concat(peps)

    msg = "Writing the ORFs with unique matches to disk"
    logger.info(msg)

    utils.write_df(merged_unique_df, args.out, index=False)
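# The key step above is drop_duplicates(subset='peptide', keep=False), which
# discards every row whose peptide appears more than once rather than keeping
# a single representative, so only peptides matching a unique ORF survive.
# A small illustration with made-up peptide and ORF identifiers:
import pandas as pd

matches_df = pd.DataFrame({
    'peptide': ['PEP1', 'PEP1', 'PEP2'],
    'orf_id': ['ORF_A', 'ORF_B', 'ORF_C'],
})

# keep=False drops all copies of duplicated peptides, so only PEP2 remains
unique_matches_df = matches_df.drop_duplicates(subset='peptide', keep=False)
print(unique_matches_df)
#   peptide orf_id
# 2    PEP2  ORF_C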