Example #1
    def resolved_data(self):
        data = self.data
        log = (log_durations(logger.debug, "resolving values")
               if self._enable_parametrization else nullcontext())
        with log:
            data = self.resolver.resolve()
        return data.get("stages", {})
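The snippet above times the resolve step only when parametrization is enabled and otherwise falls back to a no-op context manager. A minimal self-contained sketch of the same pattern, assuming only that funcy is installed (the enabled flag and the summing loop are placeholders for the real work):

import logging
from contextlib import nullcontext

from funcy import log_durations

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


def resolve(enabled=True):
    # Time the block only when the feature flag is on; otherwise use a no-op.
    timer = (log_durations(logger.debug, "resolving values")
             if enabled else nullcontext())
    with timer:
        # Placeholder for the expensive step being measured.
        return sum(i * i for i in range(10 ** 6))


resolve()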
Example #2
    def resolved_data(self):
        data = self.data
        if self._enable_parametrization:
            wdir = PathInfo(self.dvcfile.path).parent
            with log_durations(logger.debug, "resolving values"):
                resolver = DataResolver(self.repo, wdir, data)
                data = resolver.resolve()
        return data.get("stages", {})
Example #3
    def stages(self):
        data, _ = self._load()

        if self.repo.config["feature"]["parametrization"]:
            with log_durations(logger.debug, "resolving values"):
                resolver = DataResolver(data)
                data = resolver.resolve()

        lockfile_data = self._lockfile.load()
        return StageLoader(self, data.get("stages", {}), lockfile_data)
Example #4
def get_balanced_permutations(analysis, balanced, permutations):
    balanced_permutations = balanced
    if not permutations.empty:
        logger.info('Estimating significance by permutation for %s' %
                    analysis.analysis_name)
        with log_durations(
                logger.debug, 'Estimating significance by permutation for %s' %
                analysis.analysis_name):
            recs = []
            permutations.sort_index(inplace=True)  #to speed lookups
            # permutations.to_csv("permutations.csv")
            for mygene in balanced.index:
                perms = permutations.ix[mygene]
                random_TE = balanced.ix[mygene].random_TE
                random_side = 'right' if random_TE > 0 else "left"
                random_perms = perms.random_TE.order()
                random_nperms = random_perms.count()
                random_rank = random_perms.searchsorted(random_TE,
                                                        side=random_side)[0]
                if random_side == "right":
                    random_rank = random_nperms - random_rank
                random_pval_perm = float(random_rank) / random_nperms
                fixed_TE = balanced.ix[mygene].fixed_TE
                fixed_side = 'right' if fixed_TE > 0 else "left"
                fixed_perms = perms.fixed_TE.order()
                fixed_nperms = fixed_perms.count()
                fixed_rank = fixed_perms.searchsorted(fixed_TE,
                                                      side=fixed_side)[0]
                if fixed_side == "right":
                    fixed_rank = fixed_nperms - fixed_rank
                fixed_pval_perm = float(fixed_rank) / fixed_nperms
                rec = dict(random_rank=random_rank,
                           random_nperms=random_nperms,
                           random_pval_perm=random_pval_perm,
                           fixed_rank=fixed_rank,
                           fixed_nperms=fixed_nperms,
                           fixed_pval_perm=fixed_pval_perm)
                recs.append(rec)
            df = pd.DataFrame(recs)
            df.index = balanced.index
            balanced_permutations = balanced.join(df)
    return balanced_permutations
Example #5
def get_balanced_permutations(analysis, balanced, permutations):
    balanced_permutations = balanced
    if not permutations.empty:
        logger.info("Estimating significance by permutation for %s" % analysis.analysis_name)
        with log_durations(logger.debug, "Estimating significance by permutation for %s" % analysis.analysis_name):
            recs = []
            permutations.sort_index(inplace=True)  # to speed lookups
            # permutations.to_csv("permutations.csv")
            for mygene in balanced.index:
                perms = permutations.ix[mygene]
                random_TE = balanced.ix[mygene].random_TE
                random_side = "right" if random_TE > 0 else "left"
                random_perms = perms.random_TE.order()
                random_nperms = random_perms.count()
                random_rank = random_perms.searchsorted(random_TE, side=random_side)[0]
                if random_side == "right":
                    random_rank = random_nperms - random_rank
                random_pval_perm = float(random_rank) / random_nperms
                fixed_TE = balanced.ix[mygene].fixed_TE
                fixed_side = "right" if fixed_TE > 0 else "left"
                fixed_perms = perms.fixed_TE.order()
                fixed_nperms = fixed_perms.count()
                fixed_rank = fixed_perms.searchsorted(fixed_TE, side=fixed_side)[0]
                if fixed_side == "right":
                    fixed_rank = fixed_nperms - fixed_rank
                fixed_pval_perm = float(fixed_rank) / fixed_nperms
                rec = dict(
                    random_rank=random_rank,
                    random_nperms=random_nperms,
                    random_pval_perm=random_pval_perm,
                    fixed_rank=fixed_rank,
                    fixed_nperms=fixed_nperms,
                    fixed_pval_perm=fixed_pval_perm,
                )
                recs.append(rec)
            df = pd.DataFrame(recs)
            df.index = balanced.index
            balanced_permutations = balanced.join(df)
    return balanced_permutations
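Both versions of get_balanced_permutations lean on pandas APIs that were later removed: .ix indexing and Series.order(). On current pandas the per-gene ranking would use .loc, sort_values() and a scalar searchsorted instead; a rough, illustrative equivalent (permutation_pvalue and its arguments are hypothetical names, not part of the original code):

import pandas as pd


def permutation_pvalue(perm_values, observed_te):
    # sort_values() replaces the removed Series.order();
    # searchsorted on a scalar now returns a scalar, so no trailing [0].
    perms = pd.Series(perm_values).sort_values()
    nperms = perms.count()
    side = "right" if observed_te > 0 else "left"
    rank = perms.searchsorted(observed_te, side=side)
    if side == "right":
        rank = nperms - rank
    return float(rank) / nperms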
Example #6
def perform_analysis(analysis, debug=False):
    logger.info('Started %s analysis', analysis.analysis_name)
    with log_durations(logger.debug,
                       'Loading dataframe for %s' % analysis.analysis_name):
        df = get_analysis_df(analysis.case_query, analysis.control_query,
                             analysis.modifier_query)
    debug and df.to_csv("%s.analysis_df.csv" % analysis.analysis_name)

    logger.info('Matching sources: %d' %
                df.groupby(['series_id', 'platform_id']).ngroups)

    # Remove single-class sources
    query = df.groupby(['series_id', 'platform_id'
                        ]).sample_class.agg(lambda x: set(x)) >= {0, 1}
    df = filter_sources(df, query, 'as single-class')

    # Check for minimum number of samples
    if analysis.min_samples:
        counts = df.groupby(['series_id', 'platform_id'
                             ]).sample_class.value_counts().unstack()
        query = (counts[0] >= analysis.min_samples) & (counts[1] >=
                                                       analysis.min_samples)
        df = filter_sources(df, query, 'by min samples')

    # Check number of sources
    sources = df.groupby(['series_id', 'platform_id']).ngroups
    if sources <= 1:
        logger.error("FAIL Can't perform meta-analysis on %s" %
                     ('single source' if sources else 'no data'))
        return

    # Calculating stats
    analysis.series_count = len(df.series_id.unique())
    analysis.platform_count = len(df.platform_id.unique())
    analysis.sample_count = len(df.sample_id.unique())
    analysis.series_ids = df.series_id.unique().tolist()
    analysis.platform_ids = df.platform_id.unique().tolist()
    analysis.sample_ids = df.sample_id.unique().tolist()
    # analysis.save(update_fields=['series_count', 'platform_count', 'sample_count',
    #                              'series_ids', 'platform_ids', 'sample_ids'])
    logger.info('Stats: %d sources, %d series, %d platforms, %d samples' %
                (sources, analysis.series_count, analysis.platform_count,
                 analysis.sample_count))

    # Load GSE data, make and concat all fold change analyses results.
    # NOTE: we are doing load_gse() lazily here to avoid loading all matrices at once.
    logger.info('Loading data and calculating fold changes for %s',
                analysis.analysis_name)
    with log_durations(logger.debug,
                       'Load/fold for %s' % analysis.analysis_name):
        gses = (load_gse(df, series_id)
                for series_id in sorted(df.series_id.unique()))
        fold_changes = pd.concat(imap(get_fold_change_analysis, gses))
        debug and fold_changes.to_csv("%s.fc.csv" % debug)

    logger.info('Meta-Analyzing %s', analysis.analysis_name)
    with log_durations(logger.debug,
                       'Meta analysis for %s' % analysis.analysis_name):
        balanced = getFullMetaAnalysis(fold_changes, debug=debug).reset_index()
        debug and balanced.to_csv("%s.meta.csv" % debug)

    # logger.info('Inserting %s analysis results', analysis.analysis_name)
    # with log_durations(logger.debug, 'Saving results of %s' % analysis.analysis_name):#, \
    #         # transaction.atomic():
    #     balanced['analysis'] = analysis
    #     balanced.columns = balanced.columns.map(lambda x: x.replace(".", "_").lower())
    # field_names = [f.name for f in MetaAnalysis._meta.fields if f.name != 'id']
    # rows = balanced[field_names].T.to_dict().values()
    # Delete old values in case we recalculating analysis
    # MetaAnalysis.objects.filter(analysis=analysis).delete()
    # MetaAnalysis.objects.bulk_create(MetaAnalysis(**row) for row in rows)

    logger.info('DONE %s analysis', analysis.analysis_name)
    return balanced
Example #7
def perform_analysis(conn, analysis, debug=False):
    cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)

    logger.info('Started %s analysis', analysis.analysis_name)
    with log_durations(logger.debug, 'Loading dataframe for %s' % analysis.analysis_name):
        df = get_analysis_df(conn, analysis.case_query, analysis.control_query, analysis.modifier_query)
    debug and df.to_csv("%s.analysis_df.csv" % analysis.analysis_name)

    logger.info('Matching sources: %d' % df.groupby(['series_id', 'platform_id']).ngroups)

    # Remove single-class sources
    query = df.groupby(['series_id', 'platform_id']).sample_class.agg(lambda x: set(x)) >= {0, 1}
    df = filter_sources(df, query, 'as single-class')

    # Check for minimum number of samples
    if analysis.min_samples:
        counts = df.groupby(['series_id', 'platform_id']).sample_class.value_counts().unstack()
        query = (counts[0] >= analysis.min_samples) & (counts[1] >= analysis.min_samples)
        df = filter_sources(df, query, 'by min samples')

    # Check number of sources
    sources = df.groupby(['series_id', 'platform_id']).ngroups
    if sources <= 1:
        logger.error("FAIL Can't perform meta-analysis on %s"
                     % ('single source' if sources else 'no data'))
        return

    # Calculating stats
    analysis.series_count = len(df.series_id.unique())
    analysis.platform_count = len(df.platform_id.unique())
    analysis.sample_count = len(df.sample_id.unique())
    analysis.series_ids = df.series_id.unique().tolist()
    analysis.platform_ids = df.platform_id.unique().tolist()
    analysis.sample_ids = df.sample_id.unique().tolist()
    # analysis.save(update_fields=['series_count', 'platform_count', 'sample_count',
    #                              'series_ids', 'platform_ids', 'sample_ids'])
    logger.info('Stats: %d sources, %d series, %d platforms, %d samples'
                % (sources, analysis.series_count, analysis.platform_count, analysis.sample_count))

    # Load GSE data, make and concat all fold change analyses results.
    # NOTE: we are doing load_gse() lazily here to avoid loading all matrices at once.
    logger.info('Loading data and calculating fold changes for %s', analysis.analysis_name)
    with log_durations(logger.debug, 'Load/fold for %s' % analysis.analysis_name):
        gses = (load_gse(cursor, df, series_id) for series_id in sorted(df.series_id.unique()))
        fold_changes = pd.concat(imap(get_fold_change_analysis, gses))
        debug and fold_changes.to_csv("%s.fc.csv" % debug)

    logger.info('Meta-Analyzing %s', analysis.analysis_name)
    with log_durations(logger.debug, 'Meta analysis for %s' % analysis.analysis_name):
        balanced = getFullMetaAnalysis(fold_changes, debug=debug).reset_index()
        debug and balanced.to_csv("%s.meta.csv" % debug)

    # logger.info('Inserting %s analysis results', analysis.analysis_name)
    # with log_durations(logger.debug, 'Saving results of %s' % analysis.analysis_name):#, \
    #         # transaction.atomic():
    #     balanced['analysis'] = analysis
    #     balanced.columns = balanced.columns.map(lambda x: x.replace(".", "_").lower())
        # field_names = [f.name for f in MetaAnalysis._meta.fields if f.name != 'id']
        # rows = balanced[field_names].T.to_dict().values()
        # Delete old values in case we recalculating analysis
        # MetaAnalysis.objects.filter(analysis=analysis).delete()
        # MetaAnalysis.objects.bulk_create(MetaAnalysis(**row) for row in rows)

    logger.info('DONE %s analysis', analysis.analysis_name)
    return balanced
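In both perform_analysis variants the generator expression plus imap keeps only one GSE matrix in memory at a time before pd.concat joins the per-series results. imap comes from Python 2's itertools; on Python 3 the built-in map is already lazy, so the same lazy load-and-fold pattern looks roughly like this (load_series and fold_change_for are placeholder callables):

import pandas as pd
from funcy import log_durations


def concat_fold_changes(df, load_series, fold_change_for, logger):
    with log_durations(logger.debug, "Load/fold"):
        # Generators keep at most one series' matrix in memory at a time.
        series = (load_series(df, series_id)
                  for series_id in sorted(df.series_id.unique()))
        return pd.concat(map(fold_change_for, series))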
Example #8
def perform_analysis(analysis,
                     debug=False,
                     impute=False,
                     nperm=0,
                     mygene_filter=None):
    """
    Returns a tuple of sample_df, fold_change, balanced_permutations, permutations
    """
    logger.info('Started %s analysis', analysis.analysis_name)
    # from multiprocessing import Pool
    # pool = Pool(processes=4)

    with log_durations(logger.debug,
                       'Loading dataframe for %s' % analysis.analysis_name):
        df = get_analysis_df(analysis.case_query, analysis.control_query,
                             analysis.modifier_query)
    debug and df.to_csv("%s.analysis_df.csv" % analysis.analysis_name)

    logger.info('Matching sources: %d' %
                df.groupby(['series_id', 'platform_id']).ngroups)

    # Remove single-class sources
    query = df.groupby([
        'series_id', 'platform_id'
    ]).sample_class.agg(lambda x: set(x)).map(lambda x: x >= {0, 1})
    df = filter_sources(df, query, 'as single-class')

    # Check for minimum number of samples
    if analysis.min_samples:
        counts = df.groupby(['series_id', 'platform_id'
                             ]).sample_class.value_counts().unstack()
        query = (counts[0] >= analysis.min_samples) & (counts[1] >=
                                                       analysis.min_samples)
        df = filter_sources(df, query, 'by min samples')

    # Check number of sources
    sources = df.groupby(['series_id', 'platform_id']).ngroups
    if sources <= 1:
        logger.error("FAIL Can't perform meta-analysis on %s" %
                     ('single source' if sources else 'no data'))
        return df, None, None, None

    # Calculating stats
    analysis.series_count = len(df.series_id.unique())
    analysis.platform_count = len(df.platform_id.unique())
    analysis.sample_count = len(df.sample_id.unique())
    analysis.series_ids = df.series_id.unique().tolist()
    analysis.platform_ids = df.platform_id.unique().tolist()
    analysis.sample_ids = df.sample_id.unique().tolist()
    # analysis.save(update_fields=['series_count', 'platform_count', 'sample_count',
    #                              'series_ids', 'platform_ids', 'sample_ids'])
    logger.info('Stats: %d sources, %d series, %d platforms, %d samples' %
                (sources, analysis.series_count, analysis.platform_count,
                 analysis.sample_count))

    # Load GSE data, make and concat all fold change analyses results.
    # NOTE: we are doing load_gse() lazily here to avoid loading all matrices at once.
    logger.info('Loading data and calculating fold change for %s',
                analysis.analysis_name)
    with log_durations(logger.debug,
                       'Load/fold for %s' % analysis.analysis_name):
        gses = (load_gse(df, series_id, impute)
                for series_id in sorted(df.series_id.unique()))
        debugs = [debug] * df.series_id.nunique()
        nperms = [nperm] * df.series_id.nunique()
        mygene_filters = [mygene_filter] * df.series_id.nunique()

        # start a pool with 4 processes
        fold_change = pd.concat(
            imap(get_gene_fold_change, gses, debugs, nperms, mygene_filters))
        # fold_change = pd.concat(pool.imap(multi_run_wrapper, zip(gses, debugs, nperms)))
        debug and fold_change.to_csv("%s.fc.csv" % debug)

    # Start metaanalysis
    logger.info('Meta-Analyzing %s', analysis.analysis_name)
    with log_durations(logger.debug,
                       'Meta analysis for %s' % analysis.analysis_name):
        # logger.info('Meta analysis of real data for %s' % analysis.analysis_name)
        with log_durations(
                logger.debug,
                'meta analysis of real data for %s' % analysis.analysis_name):
            balanced = get_full_meta(fold_change.query("""perm == 0"""),
                                     debug=debug)
            debug and balanced.to_csv("%s.meta.csv" % debug)
        # logger.info('Meta-Analyzing of permutations for %s', analysis.analysis_name)
        with log_durations(
                logger.debug, 'meta analysis of permutations for %s' %
                analysis.analysis_name):
            permutations = pd.DataFrame()
            fold_change = fold_change.reset_index().sort('perm').set_index(
                'perm')
            for i in range(nperm):
                perm = i + 1
                # logger.info('Meta analysis of permutation %s for %s' % (perm, analysis.analysis_name))
                with log_durations(
                        logger.debug,
                        'meta analysis of permutation %s / %s for %s' %
                    (perm, nperm, analysis.analysis_name)):
                    # balanced_perm = get_full_meta(fold_change.query("""perm == %s"""%perm), debug=debug)
                    balanced_perm = get_full_meta(fold_change.ix[perm],
                                                  debug=debug)
                    permutation = balanced_perm[['random_TE', 'fixed_TE']]
                    permutation['perm'] = perm
                    permutations = pd.concat([permutations, permutation])
        balanced_permutations = get_balanced_permutations(
            balanced, permutations)

    logger.info('DONE %s analysis', analysis.analysis_name)
    return df, fold_change, balanced_permutations, permutations
Example #9
    def identifier(self) -> str:
        """Unique identifier for the index.

        We can use this to optimize and skip opening some indices
        eg: on push/pull/fetch/gc --all-commits.

        Currently, it is unique to the platform (windows vs posix).
        """
        return dict_md5(self.dumpd())


if __name__ == "__main__":
    from funcy import log_durations

    from dvc.repo import Repo

    repo = Repo()
    index = Index(repo, repo.fs)
    print(index)
    with log_durations(print, "collecting stages"):
        # pylint: disable=pointless-statement
        print("no of stages", len(index.stages))
    with log_durations(print, "building graph"):
        index.build_graph()
    with log_durations(print, "calculating hash"):
        print(index.identifier)
    with log_durations(print, "updating"):
        index2 = index.update(index.stages)
    with log_durations(print, "calculating hash"):
        print(index2.identifier)
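Here log_durations receives plain print rather than a logger method, which works because the first argument is simply a callable that gets the formatted duration message. funcy also documents using the same helper as a decorator to time every call of a function; a toy sketch (build_graph here is a placeholder, not dvc's):

from funcy import log_durations


@log_durations(print)
def build_graph(nodes):
    # Each call prints its duration via the supplied print function.
    return {n: [] for n in nodes}


build_graph(range(1000))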
Example #10
def perform_analysis(analysis, debug=False, impute=False, nperm=0, mygene_filter=None):
    """
    Returns a tuple of sample_df, fold_change, balanced_permutations, permutations
    """
    logger.info("Started %s analysis", analysis.analysis_name)
    # from multiprocessing import Pool
    # pool = Pool(processes=4)

    with log_durations(logger.debug, "Loading dataframe for %s" % analysis.analysis_name):
        df = get_analysis_df(analysis.case_query, analysis.control_query, analysis.modifier_query)
    debug and df.to_csv("%s.analysis_df.csv" % analysis.analysis_name)

    logger.info("Matching sources: %d" % df.groupby(["series_id", "platform_id"]).ngroups)

    # Remove single-class sources
    query = df.groupby(["series_id", "platform_id"]).sample_class.agg(lambda x: set(x)).map(lambda x: x >= {0, 1})
    df = filter_sources(df, query, "as single-class")

    # Check for minimum number of samples
    if not df.empty and analysis.min_samples:
        counts = df.groupby(["series_id", "platform_id"]).sample_class.value_counts().unstack()
        query = (counts[0] >= analysis.min_samples) & (counts[1] >= analysis.min_samples)
        df = filter_sources(df, query, "by min samples")

    # Check number of sources
    sources = df.groupby(["series_id", "platform_id"]).ngroups
    if sources <= 1:
        logger.error("FAIL Can't perform meta-analysis on %s" % ("single source" if sources else "no data"))
        return df, None, None, None

    # Calculating stats
    analysis.series_count = len(df.series_id.unique())
    analysis.platform_count = len(df.platform_id.unique())
    analysis.sample_count = len(df.sample_id.unique())
    analysis.series_ids = df.series_id.unique().tolist()
    analysis.platform_ids = df.platform_id.unique().tolist()
    analysis.sample_ids = df.sample_id.unique().tolist()
    # analysis.save(update_fields=['series_count', 'platform_count', 'sample_count',
    #                              'series_ids', 'platform_ids', 'sample_ids'])
    logger.info(
        "Stats: %d sources, %d series, %d platforms, %d samples"
        % (sources, analysis.series_count, analysis.platform_count, analysis.sample_count)
    )

    # Load GSE data, make and concat all fold change analyses results.
    # NOTE: we are doing load_gse() lazily here to avoid loading all matrices at once.
    logger.info("Loading data and calculating fold change for %s", analysis.analysis_name)
    with log_durations(logger.debug, "Load/fold for %s" % analysis.analysis_name):
        gses = (load_gse(df, series_id, impute) for series_id in sorted(df.series_id.unique()))
        debugs = [debug] * df.series_id.nunique()
        nperms = [nperm] * df.series_id.nunique()
        mygene_filters = [mygene_filter] * df.series_id.nunique()

        # start a pool with 4 processes
        fold_change = pd.concat(imap(get_gene_fold_change, gses, debugs, nperms, mygene_filters))
        # fold_change = pd.concat(pool.imap(multi_run_wrapper, zip(gses, debugs, nperms)))
        debug and fold_change.to_csv("%s.fc.csv" % debug)

    # Start metaanalysis
    logger.info("Meta-Analyzing %s", analysis.analysis_name)
    with log_durations(logger.debug, "Meta analysis for %s" % analysis.analysis_name):
        # logger.info('Meta analysis of real data for %s' % analysis.analysis_name)
        with log_durations(logger.debug, "meta analysis of real data for %s" % analysis.analysis_name):
            balanced = get_full_meta(fold_change.query("""perm == 0"""), debug=debug)
            if balanced is None:
                logger.error("FAIL Got empty meta-analysis")
                return df, fold_change, None, None
            debug and balanced.to_csv("%s.meta.csv" % debug)

        # logger.info('Meta-Analyzing of permutations for %s', analysis.analysis_name)
        with log_durations(logger.debug, "meta analysis of permutations for %s" % analysis.analysis_name):
            permutations = pd.DataFrame()
            fold_change = fold_change.reset_index().sort("perm").set_index("perm")
            for i in range(nperm):
                perm = i + 1
                # logger.info('Meta analysis of permutation %s for %s' % (perm, analysis.analysis_name))
                with log_durations(
                    logger.debug, "meta analysis of permutation %s / %s for %s" % (perm, nperm, analysis.analysis_name)
                ):
                    # balanced_perm = get_full_meta(fold_change.query("""perm == %s"""%perm), debug=debug)
                    balanced_perm = get_full_meta(fold_change.ix[perm], debug=debug)
                    permutation = balanced_perm[["random_TE", "fixed_TE"]]
                    permutation["perm"] = perm
                    permutations = pd.concat([permutations, permutation])
        balanced_permutations = get_balanced_permutations(analysis, balanced, permutations)

    logger.info("DONE %s analysis", analysis.analysis_name)
    return df, fold_change, balanced_permutations, permutations
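Examples #8 and #10 nest log_durations blocks so the log shows the total meta-analysis time as well as the time spent on the real data and on each individual permutation. A stripped-down sketch of that nesting, with sums standing in for the real computations:

import logging

from funcy import log_durations

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

with log_durations(logger.debug, "meta analysis"):
    with log_durations(logger.debug, "real data"):
        real = sum(range(10 ** 6))  # stand-in for get_full_meta()
    for perm in range(1, 4):
        with log_durations(logger.debug, "permutation %d" % perm):
            _ = sum(range(10 ** 6))  # stand-in for one permuted run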
Example #11
    def resolved_data(self):
        data = self.data
        with log_durations(logger.debug, "resolving values"):
            data = self.resolver.resolve()
        return data.get("stages", {})