Example 1
    def create_solver_schedules(self, scenario):
        """ Use the fit pipeline to create solver schedules for the scenario
        instances
        """
        check_is_fitted(self, ["pipeline_"])

        # currently, just take the predicted best solver
        X_test = scenario.feature_data
        y_test = scenario.performance_data

        X_test = X_test.values
        y_pred = self.predict(X_test)

        msg = "X_test.shape: {}. y_pred.shape: {}".format(
            X_test.shape, y_pred.shape)
        logger.debug(msg)

        choices = self.inverse_transform(y_pred)
        it = zip(choices, scenario.instances)
        schedules = {}

        for choice, instance in it:
            if scenario.performance_type[0] == "runtime":
                solver_schedule = self.feature_steps + [[
                    choice, scenario.algorithm_cutoff_time
                ]]
            elif scenario.performance_type[0] == "solution_quality":
                solver_schedule = [[choice, 999999999999]]
            else:
                msg = "Unknown performance type: {}".format(
                    scenario.performance_type[0])
                raise ValueError(msg)

            schedule = utils.remove_nones(solver_schedule)

            schedules[instance] = schedule

        return schedules
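
Every example on this page passes a list through utils.remove_nones before using it. The helper itself is not shown; judging from the call sites (optional pipeline steps, schedule entries, and matches), it presumably just filters None entries out of a list. A minimal sketch of that assumed behavior:

def remove_nones(items):
    # assumed behavior, reconstructed from the call sites on this page;
    # the real utils.remove_nones may differ in details
    return [item for item in items if item is not None]

# e.g., dropping an optional pipeline step that was left as None
steps = remove_nones([('nss', 'scaler'), None, ('selector', 'ensemble')])
# steps == [('nss', 'scaler'), ('selector', 'ensemble')]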
Example 2
def get_orf_positions(seq, start_codons_re, stop_codons_re):
    """ This function extracts the relative position of all ORFs from the given
        sequence. It assumes the sequence does not include any whitespace, and
        that the regular expressions properly identify start and stop codons.
        For example, if seq has already been transcribed, then the start codon
        should be "AUG".
        
        N.B. The ORFs *include* the first base in the start codon (e.g., "A" in
        "ATG") 

        Args:
            seq (string) : the (untranslated) sequence

            start_codons_re, stop_codons_re (compiled regular expression):
                regular expressions which identify start and stop codons, respectively

        Returns:
            list of orf_positions (each a named 2-tuple with "start" and
            "end" fields)

        Example usage:
            orfs = get_orf_positions(seq, start_codons_re, stop_codons_re)

            first_orf_start = orfs[0].start
            first_orf_end = orfs[0].end
    """

    # positions of all start and stop codon matches in the sequence
    start_pos = np.array([m.start() for m in start_codons_re.finditer(seq)])
    stop_pos = np.array([m.start() for m in stop_codons_re.finditer(seq)])

    # pull out the matching ends for each start
    orfs = [get_matching_stop_position(s, stop_pos) for s in start_pos]
    orfs = utils.remove_nones(orfs)
    return orfs
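
Example 2 calls get_matching_stop_position, which is not shown. A minimal sketch, assuming it pairs each start with the first downstream in-frame stop and returns None when no such stop exists (which is why utils.remove_nones is applied to the result); the real helper, including its end-coordinate convention, may differ:

import collections

import numpy as np

orf_position = collections.namedtuple("orf_position", ["start", "end"])

def get_matching_stop_position(start, stop_positions):
    """ Return the ORF from start to the first in-frame stop codon, or None
    if there is no in-frame stop downstream of start. """
    # keep only stop codons at or after the start and in the same reading frame
    in_frame = stop_positions[(stop_positions >= start)
                              & ((stop_positions - start) % 3 == 0)]
    if len(in_frame) == 0:
        return None
    return orf_position(start=start, end=int(in_frame.min()))

# example: a start at position 0 with an in-frame stop at position 9
print(get_matching_stop_position(0, np.array([4, 9, 20])))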
Example 3
    def refit(self, scenario):
        """ Update the parameters, but not hyperparameters, of the ensemble
        and other pipeline members using the new training data.

        N.B. This starts training from scratch. It does not use "warm starts".
        Also, due to some technical details, some members of the ensembles fit
        with auto-sklearn cannot be refit. Those are discarded.

        Parameters
        ----------
        scenario: an ASlibScenario
            Presumably, this is something like a (different) cv split of the
            data originally used to train the pipeline.

        Returns
        -------
        self
        """

        check_is_fitted(self, "pipeline_")

        # overwrite whatever training data we had
        self.X_train = scenario.feature_data.values
        self.y_train = scenario.performance_data

        # we can just overwrite most of the existing pipeline
        feature_selector = self.pipeline_.named_steps['feature_selector']
        nss = self.pipeline_.named_steps['nss']
        imputer = self.pipeline_.named_steps.get('imputer')
        selector = self.pipeline_.named_steps['selector']

        # none of the preprocessing has tunable hyperparameters

        # we do not always use an imputer, though
        i = None
        if imputer is not None:
            i = ('imputer', imputer)

        p = utils.remove_nones([('feature_selector', feature_selector),
                                ('nss', copy.deepcopy(nss)), i])

        # fit the first part of the pipeline to transform the training data
        p = sklearn.pipeline.Pipeline(p)
        p_fit = p.fit(self.X_train, self.y_train)

        # now, transform our data so we can send it to the ensemble

        X_tr = p_fit.transform(self.X_train)
        selector_refit = selector.refit(X_tr, self.y_train)

        # finally, reconstruct our refit pipeline
        p_fit = p_fit.steps
        p_fit.append(("selector", selector_refit))
        self.pipeline_ = sklearn.pipeline.Pipeline(p_fit)

        return self
Example 4
    def create_schedules(self, scenario):
        """ Create the algorithm selection schedules for all instances in the
        scenario
        """

        presolver_schedules = self.presolver_scheduler_.create_presolver_schedule(
            scenario)
        solver_schedules = self.pipeline_.create_solver_schedules(scenario)

        schedules = {}

        for instance in scenario.instances:

            schedule = utils.remove_nones(presolver_schedules[instance] +
                                          solver_schedules[instance])

            schedules[instance] = schedule

        return schedules
Example 5
    def fit(self, scenario):
        """ Fit the pipeline using the ASlibScenario
        """

        if self.features is None:
            num_features = len(scenario.feature_data.columns)
            self.feature_columns_ = np.arange(num_features, dtype=int)
        else:
            self.feature_columns_ = [
                scenario.feature_data.columns.get_loc(c) for c in self.features
            ]

        feature_selector = mlxtend.feature_selection.ColumnSelector(
            cols=self.feature_columns_)

        nss = NaNStandardScaler()
        as_asl_ensemble = ASaslEnsemble(
            args=self.args,
            solvers=scenario.algorithms,
            use_random_forests=self.use_random_forests)

        # if we are using random forests, then we must also impute
        # missing values
        imputer = None
        if self.use_random_forests:
            imputer = automl_utils.get_imputer("zero_fill")
            imputer = ('imputer', imputer)

        pipeline = utils.remove_nones([('feature_selector', feature_selector),
                                       ('nss', nss), imputer,
                                       ('selector', as_asl_ensemble)])

        self.pipeline = sklearn.pipeline.Pipeline(pipeline)

        self.X_train = scenario.feature_data.values
        self.y_train = scenario.performance_data

        self.pipeline_ = self.pipeline.fit(self.X_train, self.y_train)

        return self
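
Examples 3 and 5 use the same idiom: each optional step is built as either a (name, transformer) tuple or None, and utils.remove_nones strips the absent steps before the list is handed to sklearn.pipeline.Pipeline. A small, self-contained sketch of that pattern (the step names and estimators here are stand-ins, not the ones from the examples):

import sklearn.linear_model
import sklearn.pipeline
import sklearn.preprocessing

def remove_nones(items):
    # same assumed behavior as the sketch after Example 1
    return [item for item in items if item is not None]

def build_pipeline(use_scaler=True):
    # the optional step is simply None when it is not wanted
    scaler = ('scaler', sklearn.preprocessing.StandardScaler()) if use_scaler else None

    steps = remove_nones([
        scaler,
        ('model', sklearn.linear_model.LogisticRegression()),
    ])
    return sklearn.pipeline.Pipeline(steps)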
Example 6
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script extracts the differential micropeptides from two "
        "conditions. Please see the documentation in redmine for more details.\n\n"
        "Please see the pyensembl (https://github.com/hammerlab/pyensembl) "
        "documentation for more information about the ensembl release and species."
    )

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name_a', help="The name of the first condition")
    parser.add_argument('name_b', help="The name of the second condition")
    parser.add_argument('out', help="The output (.csv.gz or .xlsx) file")

    parser.add_argument(
        '-a',
        '--append-sheet',
        help="If this flag is given, "
        "then a worksheet with the name '<name_a>,<name_b>' will be appended "
        "to the .xlsx file given by out (if it exists)",
        action='store_true')

    parser.add_argument(
        '-f',
        '--filter',
        help="If this flag is present, then "
        "the output will be filtered to include only the differential "
        "micropeptides with the highest KL-divergence and read coverage",
        action='store_true')

    parser.add_argument(
        '--read-filter-percent',
        help="If the the --filter flag "
        "is given, then only the top --read-filter-percent micropeptides will "
        "be considered for the final output. They still must meet the KL-"
        "divergence filtering criteria.",
        type=float,
        default=default_read_filter_percent)

    parser.add_argument(
        '--kl-filter-percent',
        help="If the the --filter flag "
        "is given, then only the top --read-kl-percent micropeptides will "
        "be considered for the final output. They still must meet the read "
        "coverage filtering criteria.",
        type=float,
        default=default_kl_filter_percent)

    parser.add_argument(
        '--id-matches',
        help="This is a list of files which "
        "contain ORF identifiers to compare to the differential micropeptides. "
        "For each of the files given, two columns will be added to the output "
        "which indicate if either A or B appear in the respective file. Each "
        "file should have a single ORF identifier on each line and contain "
        "nothing else.",
        nargs='*',
        default=default_id_matches)

    parser.add_argument(
        '--id-match-names',
        help="A name to include in the "
        "output file for each --id-matches file. The number of names must "
        "match the number of files.",
        nargs='*',
        default=default_id_match_names)

    parser.add_argument(
        '--overlaps',
        help="This is a list of bed12+ files "
        "which will be compared to the differential micropeptides. Two columns "
        "(one for A, one for B) will be added to the output which indicate if "
        "the respective micropeptides overlap a feature in each file by at "
        "least 1 bp.",
        nargs='*',
        default=default_overlaps)

    parser.add_argument(
        '--overlap-names',
        help="A name to include in the "
        "output file for each --overlaps file. The number of names must match "
        "the number of files.",
        nargs='*',
        default=default_overlap_names)

    parser.add_argument(
        '-r',
        '--ensembl-release',
        help="The version of Ensembl "
        "to use when mapping transcript identifiers to gene identifiers",
        type=int,
        default=default_ensembl_release)

    parser.add_argument(
        '-s',
        '--ensembl-species',
        help="The Ensembl species "
        "to use when mapping transcript identifiers to gene identifiers",
        default=default_ensembl_species)

    parser.add_argument(
        '--a-is-single-sample',
        help="By default, this script "
        "assumes the predictions come from merged replicates. If name_a is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.",
        action='store_true')

    parser.add_argument(
        '--b-is-single-sample',
        help="By default, this script "
        "assumes the predictions come from merged replicates. If name_b is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.",
        action='store_true')

    parser.add_argument('--fields-to-keep',
                        help="The fields to keep from the "
                        "Bayes factor file for each condition",
                        nargs='*',
                        default=default_fields_to_keep)

    parser.add_argument('--max-micropeptide-len',
                        help="The maximum (inclusive) "
                        "length of ORFs considered as micropeptides",
                        type=int,
                        default=default_max_micropeptide_len)

    parser.add_argument(
        '--do-not-fix-tcons',
        help="By default, the \"TCONS_\" "
        "identifiers from StringTie, etc., do not parse correctly; this script "
        "update the identifiers so that will parse correctly unless instructed not "
        "to. The script is likely to crash if the identifiers are not fixed.",
        action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ensembl database"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release,
                                       species=args.ensembl_species)
    # touch the database attribute so pyensembl loads (and, if necessary,
    # builds) its index before it is needed below
    ensembl.db

    msg = "Checking the id-match and overlaps files"
    logger.info(msg)

    if len(args.id_matches) != len(args.id_match_names):
        msg = ("The number of --id-matches files and --id-match-names do not "
               "match. {} files and {} names".format(len(args.id_matches),
                                                     len(args.id_match_names)))

        raise ValueError(msg)

    if len(args.overlaps) != len(args.overlap_names):
        msg = ("The number of --overlaps files and --overlaps-names do not "
               "match. {} files and {} names".format(len(args.overlaps),
                                                     len(args.overlap_names)))

        raise ValueError(msg)

    utils.check_files_exist(args.id_matches)
    utils.check_files_exist(args.overlaps)

    if args.filter:
        msg = "Validating filter percentages"
        logger.info(msg)

        math_utils.check_range(args.read_filter_percent,
                               0,
                               1,
                               variable_name="--read-filter-percent")

        math_utils.check_range(args.kl_filter_percent,
                               0,
                               1,
                               variable_name="--kl-filter-percent")

    msg = "Extracting file names"
    logger.info(msg)

    config = yaml.load(open(args.config))

    note_str = config.get('note', None)

    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    # and the smoothing parameters
    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations',
                                        None)

    lengths_a = None
    offsets_a = None

    if args.a_is_single_sample:
        lengths_a, offsets_a = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name_a, is_unique=is_unique)

    bayes_factors_a = filenames.get_riboseq_bayes_factors(
        config['riboseq_data'],
        args.name_a,
        length=lengths_a,
        offset=offsets_a,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors_a):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".
               format(args.name_a, bayes_factors_a))
        raise FileNotFoundError(msg)

    predicted_orfs_a = filenames.get_riboseq_predicted_orfs(
        config['riboseq_data'],
        args.name_a,
        length=lengths_a,
        offset=offsets_a,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations,
        is_filtered=True,
        is_chisq=False)

    if not os.path.exists(predicted_orfs_a):
        msg = (
            "Could not find the predictions bed file for {}. ({}). Quitting.".
            format(args.name_a, predicted_orfs_a))
        raise FileNotFoundError(msg)

    lengths_b = None
    offsets_b = None
    if args.b_is_single_sample:
        lengths_b, offsets_b = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name_b, is_unique=is_unique)

    bayes_factors_b = filenames.get_riboseq_bayes_factors(
        config['riboseq_data'],
        args.name_b,
        length=lengths_b,
        offset=offsets_b,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors_b):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".
               format(args.name_b, bayes_factors_b))
        raise FileNotFoundError(msg)

    predicted_orfs_b = filenames.get_riboseq_predicted_orfs(
        config['riboseq_data'],
        args.name_b,
        length=lengths_b,
        offset=offsets_b,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations,
        is_filtered=True,
        is_chisq=False)

    if not os.path.exists(predicted_orfs_b):
        msg = (
            "Could not find the predictions bed file for {}. ({}). Quitting.".
            format(args.name_b, predicted_orfs_b))
        raise FileNotFoundError(msg)

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    if not os.path.exists(exons_file):
        msg = "Could not find the exons file ({}). Quitting.".format(
            exons_file)
        raise FileNotFoundError(msg)

    msg = "Reading the exons"
    logger.info(msg)

    exons = bed_utils.read_bed(exons_file)

    msg = "Reading the BF files"
    logger.info(msg)

    bf_df_a = bed_utils.read_bed(bayes_factors_a)
    bf_df_b = bed_utils.read_bed(bayes_factors_b)

    msg = "Reading the predictions files"
    logger.info(msg)

    bed_df_a = bed_utils.read_bed(predicted_orfs_a)
    bed_df_b = bed_utils.read_bed(predicted_orfs_b)

    differential_micropeptide_dfs = []

    # extract micropeptides
    msg = "Extracting micropeptides"
    logger.info(msg)

    m_micropeptides_a = bed_df_a['orf_len'] <= args.max_micropeptide_len
    m_micropeptides_b = bed_df_b['orf_len'] <= args.max_micropeptide_len

    micropeptides_a = bed_df_a[m_micropeptides_a]
    micropeptides_b = bed_df_b[m_micropeptides_b]

    long_orfs_a = bed_df_a[~m_micropeptides_a]
    long_orfs_b = bed_df_b[~m_micropeptides_b]

    msg = "Finding micropeptides in A with no overlap in B"
    logger.info(msg)

    micropeptides_a_no_match_b = bed_utils.subtract_bed(micropeptides_a,
                                                        bed_df_b,
                                                        exons=exons)

    micropeptides_a_no_match_b_df = pd.DataFrame()
    micropeptides_a_no_match_b_df['A'] = list(micropeptides_a_no_match_b)
    micropeptides_a_no_match_b_df['B'] = None
    micropeptides_a_no_match_b_df['kl'] = np.inf
    micropeptides_a_no_match_b_df['overlap_type'] = 'micro_a_only'

    differential_micropeptide_dfs.append(micropeptides_a_no_match_b_df)

    msg = "Finding micropeptides in B with no overlap in A"
    logger.info(msg)

    micropeptides_b_no_match_a = bed_utils.subtract_bed(micropeptides_b,
                                                        bed_df_a,
                                                        exons=exons)

    micropeptides_b_no_match_a_df = pd.DataFrame()
    micropeptides_b_no_match_a_df['B'] = list(micropeptides_b_no_match_a)
    micropeptides_b_no_match_a_df['A'] = None
    micropeptides_b_no_match_a_df['kl'] = np.inf
    micropeptides_b_no_match_a_df['overlap_type'] = 'micro_b_only'

    differential_micropeptide_dfs.append(micropeptides_b_no_match_a_df)

    msg = "Finding overlapping micropeptides"
    logger.info(msg)

    micropeptides_a_micropeptides_b_df = get_overlap_df(
        micropeptides_a, micropeptides_b, 'micro_a_micro_b', bf_df_a, bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_micropeptides_b_df)

    micropeptides_a_long_b_df = get_overlap_df(micropeptides_a, long_orfs_b,
                                               'micro_a_long_b', bf_df_a,
                                               bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_long_b_df)

    micropeptides_b_long_a_df = get_overlap_df(long_orfs_a, micropeptides_b,
                                               'long_a_micro_b', bf_df_a,
                                               bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_b_long_a_df)

    differential_micropeptides_df = pd.concat(differential_micropeptide_dfs)

    msg = "Adding read count information"
    logger.info(msg)

    res = differential_micropeptides_df.merge(bf_df_a[args.fields_to_keep],
                                              left_on='A',
                                              right_on='id',
                                              how='left')
    to_rename = {f: "{}_A".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_A', axis=1)

    res = res.merge(bf_df_b[args.fields_to_keep],
                    left_on='B',
                    right_on='id',
                    how='left')
    to_rename = {f: "{}_B".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_B', axis=1)

    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    if not args.do_not_fix_tcons:
        # replace TCONS_ with TCONS
        res['A'] = res['A'].str.replace("TCONS_", "TCONS")
        res['B'] = res['B'].str.replace("TCONS_", "TCONS")

    msg = "Extracting the genes and their biotypes using pyensembl"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release,
                                       species=args.ensembl_species)
    ensembl_transcript_ids = set(ensembl.transcript_ids())

    biotypes_a = parallel.apply_df_simple(res, get_transcript_and_biotype, 'A',
                                          ensembl, ensembl_transcript_ids)
    biotypes_b = parallel.apply_df_simple(res, get_transcript_and_biotype, 'B',
                                          ensembl, ensembl_transcript_ids)

    biotypes_a = utils.remove_nones(biotypes_a)
    biotypes_b = utils.remove_nones(biotypes_b)

    biotypes_a = pd.DataFrame(biotypes_a)
    biotypes_b = pd.DataFrame(biotypes_b)

    res = res.merge(biotypes_a, on='A', how='left')
    res = res.merge(biotypes_b, on='B', how='left')

    msg = "Pulling annotations from mygene.info"
    logger.info(msg)

    # pull annotations from mygene
    gene_info_a = mygene_utils.query_mygene(res['gene_id_A'])
    gene_info_b = mygene_utils.query_mygene(res['gene_id_B'])

    # and add the mygene info
    res = res.merge(gene_info_a,
                    left_on='gene_id_A',
                    right_on='gene_id',
                    how='left')

    to_rename = {f: "{}_A".format(f) for f in gene_info_a.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)

    res = res.merge(gene_info_b,
                    left_on='gene_id_B',
                    right_on='gene_id',
                    how='left')

    to_rename = {f: "{}_B".format(f) for f in gene_info_a.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)

    msg = "Removing duplicates"
    logger.info(msg)
    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    msg = "Adding --id-matches columns"
    logger.info(msg)

    for (id_match_file, name) in zip(args.id_matches, args.id_match_names):
        res = add_id_matches(res, id_match_file, name)

    msg = "Adding --overlaps columns"
    logger.info(msg)

    for (overlap_file, name) in zip(args.overlaps, args.overlap_names):
        res = add_overlaps(res, overlap_file, name, bed_df_a, bed_df_b, exons)

    msg = "Sorting by in-frame reads"
    logger.info(msg)

    res['x_1_sum_A'] = res['x_1_sum_A'].fillna(0)
    res['x_1_sum_B'] = res['x_1_sum_B'].fillna(0)
    res['x_1_sum'] = res['x_1_sum_A'] + res['x_1_sum_B']
    res = res.sort_values('x_1_sum', ascending=False)

    if args.filter:
        msg = "Filtering the micropeptides by read coverage and KL-divergence"
        logger.info(msg)

        x_1_sum_ranks = res['x_1_sum'].rank(method='min',
                                            na_option='top',
                                            ascending=False)
        num_x_1_sum_ranks = x_1_sum_ranks.max()
        max_good_x_1_sum_rank = num_x_1_sum_ranks * args.read_filter_percent
        m_good_x_1_sum_rank = x_1_sum_ranks <= max_good_x_1_sum_rank

        msg = ("Number of micropeptides passing read filter: {}".format(
            sum(m_good_x_1_sum_rank)))
        logger.debug(msg)

        kl_ranks = res['kl'].rank(method='dense',
                                  na_option='top',
                                  ascending=False)
        num_kl_ranks = kl_ranks.max()
        max_good_kl_rank = num_kl_ranks * args.kl_filter_percent
        m_good_kl_rank = kl_ranks <= max_good_kl_rank

        msg = ("Number of micropeptides passing KL filter: {}".format(
            sum(m_good_kl_rank)))
        logger.debug(msg)

        m_both_filters = m_good_x_1_sum_rank & m_good_kl_rank

        msg = ("Number of micropeptides passing both filters: {}".format(
            sum(m_both_filters)))
        logger.debug(msg)

        res = res[m_both_filters]

    msg = "Writing differential micropeptides to disk"
    logger.info(msg)

    if not args.append_sheet:
        utils.write_df(res, args.out, index=False)
    else:
        sheet_name = "{},{}".format(args.name_a, args.name_b)
        utils.append_to_xlsx(res, args.out, sheet=sheet_name, index=False)
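
The --filter logic at the end of this example is percentile-based: each column is ranked so that rank 1 is the largest value, and a row passes when its rank falls within the top read_filter_percent (or kl_filter_percent) fraction of the ranks. A small, self-contained illustration of that pattern with made-up numbers:

import pandas as pd

x_1_sum = pd.Series([120.0, 5.0, 60.0, 60.0, 0.0])
read_filter_percent = 0.5  # keep the top half

# ascending=False makes rank 1 the largest value; ties share the smallest
# rank in their group because of method='min'
ranks = x_1_sum.rank(method='min', na_option='top', ascending=False)

max_good_rank = ranks.max() * read_filter_percent
m_good = ranks <= max_good_rank

# keeps 120.0 and the two 60.0 values; drops 5.0 and 0.0
print(x_1_sum[m_good])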
Example 7
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script estimates the Bayes factors for all metagene profiles in the "
        "given file. The script accepts as input multiple \"periodic\" and \"nonperiodic\" "
        "models. It uses the models of each type with the best mean to estimate the Bayes "
        "factor distributions.\n\nIt contains some hard-coded field names.")

    parser.add_argument('metagene_profiles',
                        help="The (csv) file containing the metagene profiles")
    parser.add_argument('out', help="The output (csv.gz) file")

    parser.add_argument(
        '--periodic-models',
        help="A list of pickled StanModel files which contain "
        "models that somehow represent periodic metagene profiles",
        nargs="+",
        default=default_periodic_models)
    parser.add_argument(
        '--nonperiodic-models',
        help="A list of pickled StanModel files which contain "
        "models that somehow represent nonperiodic metagene profiles",
        nargs="+",
        default=default_nonperiodic_models)

    parser.add_argument(
        '--periodic-offset-start',
        help="The position, relative to the translation "
        "initiation site, to begin calculating periodicity Bayes factors (inclusive)",
        type=int,
        default=default_periodic_offset_start)
    parser.add_argument(
        '--periodic-offset-end',
        help="The position, relative to the translation "
        "initiation site, to stop calculating periodicity Bayes factors (inclusive)",
        type=int,
        default=default_periodic_offset_end)
    parser.add_argument(
        '--metagene-profile-length',
        help="The length of the profile to use in the "
        "models. metagene_profile_length + periodic_offset_end must be consistent with the length "
        "of the extracted metagene profile. The length must be divisible by three.",
        type=int,
        default=default_metagene_profile_length)

    parser.add_argument('-s',
                        '--seed',
                        help="The random seeds to use for inference",
                        type=int,
                        default=default_seed)
    parser.add_argument('-c',
                        '--chains',
                        help="The number of MCMC chains to use",
                        type=int,
                        default=default_chains)
    parser.add_argument('-i',
                        '--iterations',
                        help="The number of MCMC iterations to use for "
                        "each chain",
                        type=int,
                        default=default_iterations)

    parser.add_argument(
        '-p',
        '--num-cpus',
        help="The number of CPUs to use. Each read "
        "length will be processed in its own thread (so that is the maximum number of CPUs "
        "that is useful).",
        type=int,
        default=default_num_cpus)

    parser.add_argument('--type-field', default=default_type_field)
    parser.add_argument('--count-field', default=default_count_field)
    parser.add_argument('--position-field', default=default_position_field)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # we will parallelize based on the lengths. So we need to know which lengths
    # are present in the metagene profiles file
    metagene_profiles = pd.read_csv(args.metagene_profiles)
    lengths = list(metagene_profiles['length'].unique())

    length_str = ','.join(str(int(l)) for l in lengths)
    msg = "Estimating Bayes factors for lengths: {}".format(length_str)
    logger.info(msg)

    length_groups = metagene_profiles.groupby('length')

    with suppress_stdout_stderr():

        all_profile_estimates_df = parallel.apply_parallel_groups(
            length_groups,
            args.num_cpus,
            estimate_profile_bayes_factors,
            args,
            progress_bar=True)

    msg = "Combining estimates into one data frame"
    logger.info(msg)

    all_profile_estimates_df = utils.remove_nones(all_profile_estimates_df)
    all_profile_estimates_df = pd.concat(all_profile_estimates_df)

    pandas_utils.write_df(all_profile_estimates_df, args.out, index=False)
Example 8
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script removes all of the peptides which match to multiple "
        "ORFs from the results found with get-all-orf-peptide-matches.")

    parser.add_argument('peptide_matches', help="The peptide matches file produced "
        "by get-all-orf-peptide-matches")
    parser.add_argument('out', help="A similar peptide matches file which "
        "contains only peptides which match to a unique ORF")

    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use",
        type=int, default=default_num_cpus)
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading peptide matches"
    logger.info(msg)

    peptide_matches = pd.read_csv(args.peptide_matches)

    msg = "Splitting the grouped matches into individual peptide matches"
    logger.info(msg)

    matches = parallel.apply_parallel(  peptide_matches, 
                                        args.num_cpus, 
                                        parse_matches, 
                                        progress_bar=True)

    msg = "Removing peptides which match to multiple ORFs"
    logger.info(msg)

    matches = utils.remove_nones(matches)
    matches = utils.flatten_lists(matches)
    matches_df = pd.DataFrame(matches)
    unique_matches_df = matches_df.drop_duplicates(subset='peptide', keep=False)

    msg = "Merging the ORF-peptide matches back to single records"
    logger.info(msg)

    unique_groups = unique_matches_df.groupby('orf_id')
    merged_unique_groups = parallel.apply_parallel_groups(  unique_groups, 
                                                            args.num_cpus, 
                                                            merge_group, 
                                                            progress_bar=True)

    merged_unique_df = pd.DataFrame(merged_unique_groups)

    msg = "Re-adding the ORFs which no longer have peptide matches"
    logger.info(msg)

    m_still_has_match = peptide_matches['orf_id'].isin(merged_unique_df['orf_id'])
    peptide_matches.loc[~m_still_has_match, 'num_matches'] = 0
    peptide_matches.loc[~m_still_has_match, 'peptide_matches'] = 0

    peps = [merged_unique_df, peptide_matches[~m_still_has_match]]
    merged_unique_df = pd.concat(peps)

    msg = "Writing the ORFs with unique matches to disk"
    logger.info(msg)

    utils.write_df(merged_unique_df, args.out, index=False)
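
The key step in this example is drop_duplicates(subset='peptide', keep=False): with keep=False, pandas drops every row whose peptide appears more than once, i.e., every peptide that matches multiple ORFs. A tiny standalone illustration with made-up data:

import pandas as pd

matches_df = pd.DataFrame({
    "orf_id":  ["orf_1", "orf_2", "orf_2", "orf_3"],
    "peptide": ["AAK",   "AAK",   "MLK",   "QQR"],
})

# keep=False removes *all* occurrences of a duplicated peptide, so "AAK"
# (which matches both orf_1 and orf_2) disappears entirely
unique_matches_df = matches_df.drop_duplicates(subset="peptide", keep=False)
# remaining rows: (orf_2, MLK) and (orf_3, QQR)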
Example 9
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script extracts the differential micropeptides from two "
        "conditions. Please see the documentation in redmine for more details.\n\n"
        "Please see the pyensembl (https://github.com/hammerlab/pyensembl) "
        "documentation for more information about the ensembl release and species.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name_a', help="The name of the first condition")
    parser.add_argument('name_b', help="The name of the second condition")
    parser.add_argument('out', help="The output (.csv.gz or .xlsx) file")

    parser.add_argument('-a', '--append-sheet', help="If this flag is given, "
        "then a worksheet with the name '<name_a>,<name_b>' will be appended "
        "to the .xlsx file given by out (if it exists)", action='store_true')

    parser.add_argument('-f', '--filter', help="If this flag is present, then "
        "the output will be filtered to include only the differential "
        "micropeptides with the highest KL-divergence and read coverage",
        action='store_true')

    parser.add_argument('--read-filter-percent', help="If the the --filter flag "
        "is given, then only the top --read-filter-percent micropeptides will "
        "be considered for the final output. They still must meet the KL-"
        "divergence filtering criteria.", type=float, 
        default=default_read_filter_percent)

        
    parser.add_argument('--kl-filter-percent', help="If the the --filter flag "
        "is given, then only the top --read-kl-percent micropeptides will "
        "be considered for the final output. They still must meet the read "
        "coverage filtering criteria.", type=float, 
        default=default_kl_filter_percent)

    parser.add_argument('--id-matches', help="This is a list of files which "
        "contain ORF identifiers to compare to the differential micropeptides. "
        "For each of the files given, two columns will be added to the output "
        "which indicate if either A or B appear in the respective file. Each "
        "file should have a single ORF identifier on each line and contain "
        "nothing else.", nargs='*', default=default_id_matches)

    parser.add_argument('--id-match-names', help="A name to include in the "
        "output file for each --id-matches file. The number of names must "
        "match the number of files.", nargs='*', default=default_id_match_names)

    parser.add_argument('--overlaps', help="This is a list of bed12+ files "
        "which will be compared to the differential micropeptides. Two columns "
        "(one for A, one for B) will be added to the output which indicate if "
        "the respective micropeptides overlap a feature in each file by at "
        "least 1 bp.", nargs='*', default=default_overlaps)

    parser.add_argument('--overlap-names', help="A name to include in the "
        "output file for each --overlaps file. The number of names must match "
        "the number of files.", nargs='*', default=default_overlap_names)

    parser.add_argument('-r', '--ensembl-release', help="The version of Ensembl "
        "to use when mapping transcript identifiers to gene identifiers", 
        type=int, default=default_ensembl_release)

    parser.add_argument('-s', '--ensembl-species', help="The Ensembl species "
        "to use when mapping transcript identifiers to gene identifiers", 
        default=default_ensembl_species)

    parser.add_argument('--a-is-single-sample', help="By default, this script "
        "assumes the predictions come from merged replicates. If name_a is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.", action='store_true')

    parser.add_argument('--b-is-single-sample', help="By default, this script "
        "assumes the predictions come from merged replicates. If name_b is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.", action='store_true')

    parser.add_argument('--fields-to-keep', help="The fields to keep from the "
        "Bayes factor file for each condition", nargs='*', 
        default=default_fields_to_keep)

    parser.add_argument('--max-micropeptide-len', help="The maximum (inclusive) "
        "length of ORFs considered as micropeptides", type=int, 
        default=default_max_micropeptide_len)

    parser.add_argument('--do-not-fix-tcons', help="By default, the \"TCONS_\" "
        "identifiers from StringTie, etc., do not parse correctly; this script "
        "update the identifiers so that will parse correctly unless instructed not "
        "to. The script is likely to crash if the identifiers are not fixed.",
        action='store_true')
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ensembl database"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release, 
        species=args.ensembl_species)
    # touch the database attribute so pyensembl loads (and, if necessary,
    # builds) its index before it is needed below
    ensembl.db

    msg = "Checking the id-match and overlaps files"
    logger.info(msg)

    if len(args.id_matches) != len(args.id_match_names):
        msg = ("The number of --id-matches files and --id-match-names do not "
            "match. {} files and {} names".format(len(args.id_matches), 
            len(args.id_match_names)))

        raise ValueError(msg)

    if len(args.overlaps) != len(args.overlap_names):
        msg = ("The number of --overlaps files and --overlaps-names do not "
            "match. {} files and {} names".format(len(args.overlaps), 
            len(args.overlap_names)))

        raise ValueError(msg)

    utils.check_files_exist(args.id_matches)
    utils.check_files_exist(args.overlaps)

    if args.filter:
        msg = "Validating filter percentages"
        logger.info(msg)

        math_utils.check_range(args.read_filter_percent, 0, 1, 
            variable_name="--read-filter-percent")
            
        math_utils.check_range(args.kl_filter_percent, 0, 1, 
            variable_name="--kl-filter-percent")

    msg = "Extracting file names"
    logger.info(msg)

    config = yaml.load(open(args.config))

    note_str = config.get('note', None)

    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    # and the smoothing parameters
    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations', None)

    lengths_a = None
    offsets_a = None

    if args.a_is_single_sample:
        lengths_a, offsets_a = ribo_utils.get_periodic_lengths_and_offsets(config, 
                args.name_a, is_unique=is_unique)

    bayes_factors_a = filenames.get_riboseq_bayes_factors(config['riboseq_data'], args.name_a, 
        length=lengths_a, offset=offsets_a, is_unique=is_unique, note=note_str, 
        fraction=fraction, reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors_a):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".
            format(args.name_a, bayes_factors_a))
        raise FileNotFoundError(msg)

    predicted_orfs_a = filenames.get_riboseq_predicted_orfs(config['riboseq_data'], 
        args.name_a, length=lengths_a, offset=offsets_a, is_unique=is_unique, note=note_str, 
        fraction=fraction, reweighting_iterations=reweighting_iterations,
        is_filtered=True, is_chisq=False)
    
    if not os.path.exists(predicted_orfs_a):
        msg = ("Could not find the predictions bed file for {}. ({}). Quitting.".
            format(args.name_a, predicted_orfs_a))
        raise FileNotFoundError(msg)


    lengths_b = None
    offsets_b = None
    if args.b_is_single_sample:
        lengths_b, offsets_b = ribo_utils.get_periodic_lengths_and_offsets(config, 
                args.name_b, is_unique=is_unique)
        
    bayes_factors_b = filenames.get_riboseq_bayes_factors(config['riboseq_data'], args.name_b, 
        length=lengths_b, offset=offsets_b, is_unique=is_unique, note=note_str, 
        fraction=fraction, reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors_b):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".
            format(args.name_b, bayes_factors_b))
        raise FileNotFoundError(msg)

    predicted_orfs_b = filenames.get_riboseq_predicted_orfs(config['riboseq_data'], 
        args.name_b, length=lengths_b, offset=offsets_b, is_unique=is_unique, note=note_str, 
        fraction=fraction, reweighting_iterations=reweighting_iterations,
        is_filtered=True, is_chisq=False)
    
    if not os.path.exists(predicted_orfs_b):
        msg = ("Could not find the predictions bed file for {}. ({}). Quitting.".
            format(args.name_b, predicted_orfs_b))
        raise FileNotFoundError(msg)

    exons_file = filenames.get_exons(config['genome_base_path'], config['genome_name'],
        note=config.get('orf_note'), is_orf=True)

    if not os.path.exists(exons_file):
        msg = "Could not find the exons file ({}). Quitting.".format(exons_file)
        raise FileNotFoundError(msg)

    msg = "Reading the exons"
    logger.info(msg)

    exons = bed_utils.read_bed(exons_file)

    msg = "Reading the BF files"
    logger.info(msg)

    bf_df_a = bed_utils.read_bed(bayes_factors_a)
    bf_df_b = bed_utils.read_bed(bayes_factors_b)

    msg = "Reading the predictions files"
    logger.info(msg)

    bed_df_a = bed_utils.read_bed(predicted_orfs_a)
    bed_df_b = bed_utils.read_bed(predicted_orfs_b)

    differential_micropeptide_dfs = []

    # extract micropeptides
    msg = "Extracting micropeptides"
    logger.info(msg)

    m_micropeptides_a = bed_df_a['orf_len'] <= args.max_micropeptide_len
    m_micropeptides_b = bed_df_b['orf_len'] <= args.max_micropeptide_len

    micropeptides_a = bed_df_a[m_micropeptides_a]
    micropeptides_b = bed_df_b[m_micropeptides_b]

    long_orfs_a = bed_df_a[~m_micropeptides_a]
    long_orfs_b = bed_df_b[~m_micropeptides_b]

    msg = "Finding micropeptides in A with no overlap in B"
    logger.info(msg)

    micropeptides_a_no_match_b = bed_utils.subtract_bed(micropeptides_a, bed_df_b, exons=exons)

    micropeptides_a_no_match_b_df = pd.DataFrame()
    micropeptides_a_no_match_b_df['A'] = list(micropeptides_a_no_match_b)
    micropeptides_a_no_match_b_df['B'] = None
    micropeptides_a_no_match_b_df['kl'] = np.inf
    micropeptides_a_no_match_b_df['overlap_type'] = 'micro_a_only'

    differential_micropeptide_dfs.append(micropeptides_a_no_match_b_df)

    msg = "Finding micropeptides in B with no overlap in A"
    logger.info(msg)

    micropeptides_b_no_match_a = bed_utils.subtract_bed(micropeptides_b, bed_df_a, exons=exons)

    micropeptides_b_no_match_a_df = pd.DataFrame()
    micropeptides_b_no_match_a_df['B'] = list(micropeptides_b_no_match_a)
    micropeptides_b_no_match_a_df['A'] = None
    micropeptides_b_no_match_a_df['kl'] = np.inf
    micropeptides_b_no_match_a_df['overlap_type'] = 'micro_b_only'

    differential_micropeptide_dfs.append(micropeptides_b_no_match_a_df)

    msg = "Finding overlapping micropeptides"
    logger.info(msg)

    micropeptides_a_micropeptides_b_df = get_overlap_df(micropeptides_a, 
        micropeptides_b, 'micro_a_micro_b', bf_df_a, bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_micropeptides_b_df)

    micropeptides_a_long_b_df = get_overlap_df(micropeptides_a, long_orfs_b, 
        'micro_a_long_b', bf_df_a, bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_long_b_df)


    micropeptides_b_long_a_df = get_overlap_df(long_orfs_a, micropeptides_b, 
        'long_a_micro_b', bf_df_a, bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_b_long_a_df)

    differential_micropeptides_df = pd.concat(differential_micropeptide_dfs)

    msg = "Adding read count information"
    logger.info(msg)

    res = differential_micropeptides_df.merge(bf_df_a[args.fields_to_keep], 
        left_on='A', right_on='id', how='left')
    to_rename = {f: "{}_A".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_A', axis=1)

    res = res.merge(bf_df_b[args.fields_to_keep], left_on='B', right_on='id', how='left')
    to_rename = {f: "{}_B".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_B', axis=1)

    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    if not args.do_not_fix_tcons:
        # replace TCONS_ with TCONS
        res['A'] = res['A'].str.replace("TCONS_", "TCONS")
        res['B'] = res['B'].str.replace("TCONS_", "TCONS")

    msg = "Extracting the genes and their biotypes using pyensembl"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release, 
        species=args.ensembl_species)
    ensembl_transcript_ids = set(ensembl.transcript_ids())

    biotypes_a = parallel.apply_df_simple(res, get_transcript_and_biotype, 
        'A', ensembl, ensembl_transcript_ids)
    biotypes_b = parallel.apply_df_simple(res, get_transcript_and_biotype, 
        'B', ensembl, ensembl_transcript_ids)

    biotypes_a = utils.remove_nones(biotypes_a)
    biotypes_b = utils.remove_nones(biotypes_b)

    biotypes_a = pd.DataFrame(biotypes_a)
    biotypes_b = pd.DataFrame(biotypes_b)

    res = res.merge(biotypes_a, on='A', how='left')
    res = res.merge(biotypes_b, on='B', how='left')

    msg = "Pulling annotations from mygene.info"
    logger.info(msg)

    # pull annotations from mygene
    gene_info_a = mygene_utils.query_mygene(res['gene_id_A'])
    gene_info_b = mygene_utils.query_mygene(res['gene_id_B'])

    # and add the mygene info
    res = res.merge(gene_info_a, left_on='gene_id_A', right_on='gene_id', 
        how='left')

    to_rename = {f: "{}_A".format(f) for f in gene_info_a.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)

    res = res.merge(gene_info_b, left_on='gene_id_B', 
        right_on='gene_id', how='left')

    to_rename = {f: "{}_B".format(f) for f in gene_info_a.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)
        
    msg = "Removing duplicates"
    logger.info(msg)
    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    msg = "Adding --id-matches columns"
    logger.info(msg)

    for (id_match_file, name) in zip(args.id_matches, args.id_match_names):
        res = add_id_matches(res, id_match_file, name)

    msg = "Adding --overlaps columns"
    logger.info(msg)

    for (overlap_file, name) in zip(args.overlaps, args.overlap_names):
        res = add_overlaps(res, overlap_file, name, bed_df_a, bed_df_b, exons)

    msg = "Sorting by in-frame reads"
    logger.info(msg)

    res['x_1_sum_A'] = res['x_1_sum_A'].fillna(0)
    res['x_1_sum_B'] = res['x_1_sum_B'].fillna(0)
    res['x_1_sum'] = res['x_1_sum_A'] + res['x_1_sum_B']
    res = res.sort_values('x_1_sum', ascending=False)

    if args.filter:
        msg = "Filtering the micropeptides by read coverage and KL-divergence"
        logger.info(msg)

        x_1_sum_ranks = res['x_1_sum'].rank(method='min', na_option='top', 
            ascending=False)
        num_x_1_sum_ranks = x_1_sum_ranks.max()
        max_good_x_1_sum_rank = num_x_1_sum_ranks * args.read_filter_percent
        m_good_x_1_sum_rank = x_1_sum_ranks <= max_good_x_1_sum_rank

        msg = ("Number of micropeptides passing read filter: {}".format(
            sum(m_good_x_1_sum_rank)))
        logger.debug(msg)

        kl_ranks = res['kl'].rank(method='dense', na_option='top', ascending=False)
        num_kl_ranks = kl_ranks.max()
        max_good_kl_rank = num_kl_ranks * args.kl_filter_percent
        m_good_kl_rank = kl_ranks <= max_good_kl_rank

        msg = ("Number of micropeptides passing KL filter: {}".format(
            sum(m_good_kl_rank)))
        logger.debug(msg)

        m_both_filters = m_good_x_1_sum_rank & m_good_kl_rank
        
        msg = ("Number of micropeptides passing both filters: {}".format(
            sum(m_both_filters)))
        logger.debug(msg)

        res = res[m_both_filters]


    msg = "Writing differential micropeptides to disk"
    logger.info(msg)

    if not args.append_sheet:
        pandas_utils.write_df(res, args.out, index=False)
    else:
        sheet_name = "{},{}".format(args.name_a, args.name_b)
        utils.append_to_xlsx(res, args.out, sheet=sheet_name, index=False)
Example 10
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script removes all of the peptides which match to multiple "
        "ORFs from the results found with get-all-orf-peptide-matches.")

    parser.add_argument('peptide_matches', help="The peptide matches file produced "
        "by get-all-orf-peptide-matches")
    parser.add_argument('out', help="A similar peptide matches file which "
        "contains only peptides which match to a unique ORF")

    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use",
        type=int, default=default_num_cpus)
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading peptide matches"
    logger.info(msg)

    peptide_matches = pd.read_csv(args.peptide_matches)

    msg = "Splitting the grouped matches into individual peptide matches"
    logger.info(msg)

    matches = parallel.apply_parallel(  peptide_matches, 
                                        args.num_cpus, 
                                        parse_matches, 
                                        progress_bar=True)

    msg = "Removing peptides which match to multiple ORFs"
    logger.info(msg)

    matches = utils.remove_nones(matches)
    matches = utils.flatten_lists(matches)
    matches_df = pd.DataFrame(matches)
    unique_matches_df = matches_df.drop_duplicates(subset='peptide', keep=False)

    msg = "Merging the ORF-peptide matches back to single records"
    logger.info(msg)

    unique_groups = unique_matches_df.groupby('orf_id')
    merged_unique_groups = parallel.apply_parallel_groups(  unique_groups, 
                                                            args.num_cpus, 
                                                            merge_group, 
                                                            progress_bar=True)

    merged_unique_df = pd.DataFrame(merged_unique_groups)

    msg = "Re-adding the ORFs which no longer have peptide matches"
    logger.info(msg)

    m_still_has_match = peptide_matches['orf_id'].isin(merged_unique_df['orf_id'])
    peptide_matches.loc[~m_still_has_match, 'num_matches'] = 0
    peptide_matches.loc[~m_still_has_match, 'peptide_matches'] = 0

    peps = [merged_unique_df, peptide_matches[~m_still_has_match]]
    merged_unique_df = pd.concat(peps)

    msg = "Writing the ORFs with unique matches to disk"
    logger.info(msg)

    pandas_utils.write_df(merged_unique_df, args.out, index=False)