def mergesort(filename, output=None, key=None, maxitems=1e6, progress=True):
    """Given an input file, sort it by performing a merge sort on disk.

    :param filename: Either a filename as a ``str`` or a
                     ``py._path.local.LocalPath`` instance.
    :type filename:  ``str`` or ``py._path.local.LocalPath``

    :param output: An optional output filename as a ``str`` or a
                   ``py._path.local.LocalPath`` instance.
    :type output:  ``str`` or ``py._path.local.LocalPath`` or ``None``

    :param key: An optional key to sort the data on.
    :type key:  ``function`` or ``None``

    :param maxitems: Maximum number of items to hold in memory at a time.
    :type maxitems:  ``int``

    :param progress: Whether or not to display a progress bar
    :type progress:  ``bool``

    This uses ``py._path.local.LocalPath.make_numbered_dir`` to create
    temporary scratch space to work with when splitting the input file
    into sorted chunks. The mergesort is processed iteratively in-memory
    using the ``merge`` function, which is almost identical to
    ``heapq.merge`` but adds support for an optional key function.
    """

    p = filename if isinstance(filename, LocalPath) else LocalPath(filename)
    # Coerce output to a LocalPath as well so .open() works below.
    output = p if output is None else (
        output if isinstance(output, LocalPath) else LocalPath(output))
    key = key if key is not None else lambda x: x

    scratch = LocalPath.make_numbered_dir(prefix="mergesort-")

    nlines = sum(1 for line in p.open("r"))

    # Compute a reasonable chunksize < maxitems
    chunksize = first(ifilter(
        lambda x: x < maxitems,
        imap(lambda x: nlines / (2 ** x), count(1))))

    # Split the file up into n sorted files
    if progress:
        bar = ProgressBar("Split/Sorting Data", max=(nlines / chunksize))

    for i, items in enumerate(ichunks(chunksize, jsonstream(p))):
        with scratch.ensure("{0:d}.json".format(i)).open("w") as f:
            f.write("\n".join(map(dumps, sorted(items, key=key))))
        if progress:
            bar.next()

    if progress:
        bar.finish()

    q = scratch.listdir("*.json")

    with output.open("w") as f:
        if progress:
            bar = ProgressBar("Merge/Sorting Data", max=nlines)

        # Pass the key through so the merge honours the same ordering
        # as the per-chunk sort above.
        for item in merge(*imap(jsonstream, q), key=key):
            f.write("{0:s}\n".format(dumps(item)))
            if progress:
                bar.next()

        if progress:
            bar.finish()
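# A minimal usage sketch for mergesort() above (file names and the "timestamp"
# key are hypothetical). It assumes the input is a JSON-lines file, i.e. one
# JSON object per line, which is what jsonstream() consumes.
#
#     mergesort(
#         "events.json",
#         output="events.sorted.json",
#         key=lambda event: event["timestamp"],
#         maxitems=1e5,   # smaller chunks mean less memory per split file
#         progress=False,
#     )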
def kick(self, sock, source, name, nick, reason=None):
    user = models.User.objects.filter(sock=sock).first()

    channel = models.Channel.objects.filter(name=name).first()
    if channel is None:
        return ERR_NOSUCHCHANNEL(name)

    if not user.oper and user not in channel.operators:
        return ERR_CHANOPRIVSNEEDED(channel.name)

    if nick not in imap(attrgetter("nick"), channel.users):
        return ERR_USERNOTINCHANNEL(nick, channel.name)

    nick = models.User.objects.filter(nick=nick).first()

    self.notify(
        channel.users[:],
        Message(u"KICK", channel.name, nick.nick,
                reason or nick.nick, prefix=user.prefix))

    nick.channels.remove(channel)
    nick.save()

    channel.users.remove(nick)

    # Drop the kicked user's channel roles as well.
    if nick in channel.operators:
        channel.operators.remove(nick)
    if nick in channel.voiced:
        channel.voiced.remove(nick)
    channel.save()

    if not channel.users:
        channel.delete()
def kick(self, sock, source, name, nick, reason=None):
    user = models.User.objects.filter(sock=sock).first()

    channel = models.Channel.objects.filter(name=name).first()
    if channel is None:
        return ERR_NOSUCHCHANNEL(name)

    if not user.oper and user not in channel.operators:
        return ERR_CHANOPRIVSNEEDED(channel.name)

    if nick not in imap(attrgetter("nick"), channel.users):
        return ERR_USERNOTINCHANNEL(nick, channel.name)

    nick = models.User.objects.filter(nick=nick).first()

    self.notify(
        channel.users[:],
        KICK(channel.name, nick.nick, reason or nick.nick,
             prefix=user.prefix)
    )

    nick.channels.remove(channel)
    nick.save()

    channel.users.remove(nick)

    # Drop the kicked user's channel roles as well.
    if nick in channel.operators:
        channel.operators.remove(nick)
    if nick in channel.voiced:
        channel.voiced.remove(nick)
    channel.save()

    if not channel.users:
        channel.delete()
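# Hypothetical invocation sketch for the kick() handlers above: "server"
# stands in for whatever object these handlers are bound to, and sock/source
# come from the connection layer.
#
#     server.kick(sock, source, u"#example", u"somenick", reason=u"flooding")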
def environment(self, vars_):
    """Set up environment variables to trigger analysis dumps from clang.

    We'll store all the harvested metadata in the plugin's temporary
    folder.
    """
    tree = self.tree
    plugin_folder = os.path.dirname(__file__)
    flags = [
        '-load', os.path.join(plugin_folder, 'libclang-index-plugin.so'),
        '-add-plugin', 'dxr-index',
        '-plugin-arg-dxr-index', tree.source_folder
    ]
    flags_str = " ".join(imap('-Xclang {}'.format, flags))

    env = {
        'CC': "clang %s" % flags_str,
        'CXX': "clang++ %s" % flags_str,
        'DXR_CLANG_FLAGS': flags_str,
        'DXR_CXX_CLANG_OBJECT_FOLDER': tree.object_folder,
        'DXR_CXX_CLANG_TEMP_FOLDER': self._temp_folder,
    }
    env['DXR_CC'] = env['CC']
    env['DXR_CXX'] = env['CXX']

    return merge(vars_, env)
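# Sketch of how environment() might be driven (the "plugin" instance and the
# build command are assumptions, not part of the original): the returned
# mapping becomes the build environment, so clang picks up the DXR indexing
# plugin via the -Xclang flags baked into CC/CXX.
#
#     import os, subprocess
#     env = plugin.environment(dict(os.environ))
#     subprocess.check_call(["make"], env=env)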
def load_config(config):
    if "basicauth" not in config:
        raise ConfigError("Basic Auth not configured!")

    for param in ("passwd",):
        if param not in config["basicauth"]:
            raise ConfigError(
                "Basic Auth not configured! Missing: {0}".format(repr(param)))

    config = config["basicauth"]

    realm = config.get("realm", "kdb")

    hasher = config.get("hasher", "sha")
    if hasher not in HASHERS:
        raise ConfigError("Unsupported hasher: {0}".format(repr(hasher)))

    with open(config["passwd"], "r") as f:
        users = dict(imap(rpartial(str.split, ":"), imap(str.strip, f)))

    return users, realm, hasher
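# Illustration (contents hypothetical): the passwd file read above is expected
# to hold one "user:hash" pair per line, so dict(imap(...)) maps usernames to
# password hashes, e.g.
#
#     alice:5baa61e4c9b93f3b0682250b6cf8331b7ee68fd8
#     bob:7c4a8d09ca3762af61e59520943dc26494f8941b
#
# A matching config might then look like:
#
#     users, realm, hasher = load_config(
#         {"basicauth": {"passwd": "/etc/kdb/passwd", "hasher": "sha"}})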
def annotations_by_line(self):
    icon = "background-image: url('{0}/static/icons/warning.png');".format(
        self.tree.config.www_root)  # TODO: DRY
    getter = itemgetter('msg', 'opt', 'span')
    for msg, opt, span in imap(getter, self.condensed.get('warnings', [])):
        if opt:
            msg = "{0}[{1}]".format(msg, opt)
        annotation = {
            'title': msg,
            'class': "note note-warning",
            'style': icon
        }
        yield annotation, span
def jsonstream(filename, encoding="utf-8"):
    """Stream every line in the given file, interpreting each line as JSON.

    :param filename: A ``str`` filename, a ``py._path.local.LocalPath``
                     instance or an open ``file`` instance.
    :type filename:  ``str``, ``py._path.local.LocalPath`` or ``file``

    :param encoding: A ``str`` indicating the charset/encoding to use.
    :type encoding:  ``str``

    This is a wrapper around ``stream`` except that it wraps each line
    in a ``loads`` call, essentially treating each line as a piece of
    valid JSON.
    """
    return imap(loads, stream(filename, encoding=encoding))
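# Usage sketch (file name hypothetical): each item yielded by jsonstream() is
# the decoded Python object for one line of the file.
#
#     for record in jsonstream("events.json"):
#         print record["id"]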
def replace(self, task_or_uuid, new_task):
    src_task = self.task(task_or_uuid)
    tasks = imap(lambda t: self.task(new_task) if t == src_task else t, self)
    self._atomic_write(tasks)
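# Sketch (uuid value hypothetical): swap one task for another by uuid. The
# imap keeps the substitution lazy until _atomic_write() iterates over it and
# rewrites the backing file.
#
#     tasks.replace("4f9a6c1e-0d2b-4e57-9c3a-1d2e3f4a5b6c", new_task)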
def perform_analysis(analysis, debug=False):
    logger.info('Started %s analysis', analysis.analysis_name)

    with log_durations(logger.debug, 'Loading dataframe for %s' % analysis.analysis_name):
        df = get_analysis_df(analysis.case_query, analysis.control_query,
                             analysis.modifier_query)
    debug and df.to_csv("%s.analysis_df.csv" % analysis.analysis_name)
    logger.info('Matching sources: %d' % df.groupby(['series_id', 'platform_id']).ngroups)

    # Remove single-class sources
    query = df.groupby(['series_id', 'platform_id']) \
        .sample_class.agg(lambda x: set(x)).map(lambda x: x >= {0, 1})
    df = filter_sources(df, query, 'as single-class')

    # Check for minimum number of samples
    if analysis.min_samples:
        counts = df.groupby(['series_id', 'platform_id']) \
            .sample_class.value_counts().unstack()
        query = (counts[0] >= analysis.min_samples) & (counts[1] >= analysis.min_samples)
        df = filter_sources(df, query, 'by min samples')

    # Check number of sources
    sources = df.groupby(['series_id', 'platform_id']).ngroups
    if sources <= 1:
        logger.error("FAIL Can't perform meta-analysis on %s"
                     % ('single source' if sources else 'no data'))
        return

    # Calculate stats
    analysis.series_count = len(df.series_id.unique())
    analysis.platform_count = len(df.platform_id.unique())
    analysis.sample_count = len(df.sample_id.unique())
    analysis.series_ids = df.series_id.unique().tolist()
    analysis.platform_ids = df.platform_id.unique().tolist()
    analysis.sample_ids = df.sample_id.unique().tolist()
    # analysis.save(update_fields=['series_count', 'platform_count', 'sample_count',
    #                              'series_ids', 'platform_ids', 'sample_ids'])
    logger.info('Stats: %d sources, %d series, %d platforms, %d samples'
                % (sources, analysis.series_count, analysis.platform_count,
                   analysis.sample_count))

    # Load GSE data, make and concat all fold change analyses results.
    # NOTE: we are doing load_gse() lazily here to avoid loading all matrices at once.
    logger.info('Loading data and calculating fold changes for %s', analysis.analysis_name)
    with log_durations(logger.debug, 'Load/fold for %s' % analysis.analysis_name):
        gses = (load_gse(df, series_id) for series_id in sorted(df.series_id.unique()))
        fold_changes = pd.concat(imap(get_fold_change_analysis, gses))
        debug and fold_changes.to_csv("%s.fc.csv" % debug)

    logger.info('Meta-Analyzing %s', analysis.analysis_name)
    with log_durations(logger.debug, 'Meta analysis for %s' % analysis.analysis_name):
        balanced = getFullMetaAnalysis(fold_changes, debug=debug).reset_index()
        debug and balanced.to_csv("%s.meta.csv" % debug)

    # logger.info('Inserting %s analysis results', analysis.analysis_name)
    # with log_durations(logger.debug, 'Saving results of %s' % analysis.analysis_name):
    #     # transaction.atomic():
    #     balanced['analysis'] = analysis
    #     balanced.columns = balanced.columns.map(lambda x: x.replace(".", "_").lower())
    #     field_names = [f.name for f in MetaAnalysis._meta.fields if f.name != 'id']
    #     rows = balanced[field_names].T.to_dict().values()
    #     # Delete old values in case we are recalculating the analysis
    #     MetaAnalysis.objects.filter(analysis=analysis).delete()
    #     MetaAnalysis.objects.bulk_create(MetaAnalysis(**row) for row in rows)

    logger.info('DONE %s analysis', analysis.analysis_name)
    return balanced
def process_function(props):
    # Compute FuncSig based on args:
    input_args = tuple(
        ifilter(bool, imap(str.lstrip, props['args'][1:-1].split(","))))
    props['type'] = c_type_sig(input_args, props['type'])
    return props
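# Worked example (values hypothetical): for a declaration like
# "int add(int a, int b)", the condensed properties might arrive as
#
#     props = {'args': '(int a, int b)', 'type': 'int'}
#
# The slice [1:-1] strips the parentheses, the split/lstrip yields
# ('int a', 'int b'), and process_function() rewrites props['type'] to the
# signature c_type_sig(('int a', 'int b'), 'int') produces.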
def perform_analysis(conn, analysis, debug=False):
    cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
    logger.info('Started %s analysis', analysis.analysis_name)

    with log_durations(logger.debug, 'Loading dataframe for %s' % analysis.analysis_name):
        df = get_analysis_df(conn, analysis.case_query, analysis.control_query,
                             analysis.modifier_query)
    debug and df.to_csv("%s.analysis_df.csv" % analysis.analysis_name)
    logger.info('Matching sources: %d' % df.groupby(['series_id', 'platform_id']).ngroups)

    # Remove single-class sources
    query = df.groupby(['series_id', 'platform_id']) \
        .sample_class.agg(lambda x: set(x)).map(lambda x: x >= {0, 1})
    df = filter_sources(df, query, 'as single-class')

    # Check for minimum number of samples
    if analysis.min_samples:
        counts = df.groupby(['series_id', 'platform_id']) \
            .sample_class.value_counts().unstack()
        query = (counts[0] >= analysis.min_samples) & (counts[1] >= analysis.min_samples)
        df = filter_sources(df, query, 'by min samples')

    # Check number of sources
    sources = df.groupby(['series_id', 'platform_id']).ngroups
    if sources <= 1:
        logger.error("FAIL Can't perform meta-analysis on %s"
                     % ('single source' if sources else 'no data'))
        return

    # Calculate stats
    analysis.series_count = len(df.series_id.unique())
    analysis.platform_count = len(df.platform_id.unique())
    analysis.sample_count = len(df.sample_id.unique())
    analysis.series_ids = df.series_id.unique().tolist()
    analysis.platform_ids = df.platform_id.unique().tolist()
    analysis.sample_ids = df.sample_id.unique().tolist()
    # analysis.save(update_fields=['series_count', 'platform_count', 'sample_count',
    #                              'series_ids', 'platform_ids', 'sample_ids'])
    logger.info('Stats: %d sources, %d series, %d platforms, %d samples'
                % (sources, analysis.series_count, analysis.platform_count,
                   analysis.sample_count))

    # Load GSE data, make and concat all fold change analyses results.
    # NOTE: we are doing load_gse() lazily here to avoid loading all matrices at once.
    logger.info('Loading data and calculating fold changes for %s', analysis.analysis_name)
    with log_durations(logger.debug, 'Load/fold for %s' % analysis.analysis_name):
        gses = (load_gse(cursor, df, series_id) for series_id in sorted(df.series_id.unique()))
        fold_changes = pd.concat(imap(get_fold_change_analysis, gses))
        debug and fold_changes.to_csv("%s.fc.csv" % debug)

    logger.info('Meta-Analyzing %s', analysis.analysis_name)
    with log_durations(logger.debug, 'Meta analysis for %s' % analysis.analysis_name):
        balanced = getFullMetaAnalysis(fold_changes, debug=debug).reset_index()
        debug and balanced.to_csv("%s.meta.csv" % debug)

    # logger.info('Inserting %s analysis results', analysis.analysis_name)
    # with log_durations(logger.debug, 'Saving results of %s' % analysis.analysis_name):
    #     # transaction.atomic():
    #     balanced['analysis'] = analysis
    #     balanced.columns = balanced.columns.map(lambda x: x.replace(".", "_").lower())
    #     field_names = [f.name for f in MetaAnalysis._meta.fields if f.name != 'id']
    #     rows = balanced[field_names].T.to_dict().values()
    #     # Delete old values in case we are recalculating the analysis
    #     MetaAnalysis.objects.filter(analysis=analysis).delete()
    #     MetaAnalysis.objects.bulk_create(MetaAnalysis(**row) for row in rows)

    logger.info('DONE %s analysis', analysis.analysis_name)
    return balanced
def perform_analysis(analysis, debug=False, impute=False, nperm=0, mygene_filter=None):
    """
    Returns a tuple of sample_df, fold_change, balanced_permutations, permutations
    """
    logger.info('Started %s analysis', analysis.analysis_name)
    # from multiprocessing import Pool
    # pool = Pool(processes=4)

    with log_durations(logger.debug, 'Loading dataframe for %s' % analysis.analysis_name):
        df = get_analysis_df(analysis.case_query, analysis.control_query,
                             analysis.modifier_query)
    debug and df.to_csv("%s.analysis_df.csv" % analysis.analysis_name)
    logger.info('Matching sources: %d' % df.groupby(['series_id', 'platform_id']).ngroups)

    # Remove single-class sources
    query = df.groupby(['series_id', 'platform_id']) \
        .sample_class.agg(lambda x: set(x)).map(lambda x: x >= {0, 1})
    df = filter_sources(df, query, 'as single-class')

    # Check for minimum number of samples
    if analysis.min_samples:
        counts = df.groupby(['series_id', 'platform_id']) \
            .sample_class.value_counts().unstack()
        query = (counts[0] >= analysis.min_samples) & (counts[1] >= analysis.min_samples)
        df = filter_sources(df, query, 'by min samples')

    # Check number of sources
    sources = df.groupby(['series_id', 'platform_id']).ngroups
    if sources <= 1:
        logger.error("FAIL Can't perform meta-analysis on %s"
                     % ('single source' if sources else 'no data'))
        return df, None, None, None

    # Calculate stats
    analysis.series_count = len(df.series_id.unique())
    analysis.platform_count = len(df.platform_id.unique())
    analysis.sample_count = len(df.sample_id.unique())
    analysis.series_ids = df.series_id.unique().tolist()
    analysis.platform_ids = df.platform_id.unique().tolist()
    analysis.sample_ids = df.sample_id.unique().tolist()
    # analysis.save(update_fields=['series_count', 'platform_count', 'sample_count',
    #                              'series_ids', 'platform_ids', 'sample_ids'])
    logger.info('Stats: %d sources, %d series, %d platforms, %d samples'
                % (sources, analysis.series_count, analysis.platform_count,
                   analysis.sample_count))

    # Load GSE data, make and concat all fold change analyses results.
    # NOTE: we are doing load_gse() lazily here to avoid loading all matrices at once.
    logger.info('Loading data and calculating fold change for %s', analysis.analysis_name)
    with log_durations(logger.debug, 'Load/fold for %s' % analysis.analysis_name):
        gses = (load_gse(df, series_id, impute)
                for series_id in sorted(df.series_id.unique()))
        debugs = [debug] * df.series_id.nunique()
        nperms = [nperm] * df.series_id.nunique()
        mygene_filters = [mygene_filter] * df.series_id.nunique()
        # start a pool with 4 processes
        fold_change = pd.concat(
            imap(get_gene_fold_change, gses, debugs, nperms, mygene_filters))
        # fold_change = pd.concat(pool.imap(multi_run_wrapper, zip(gses, debugs, nperms)))
        debug and fold_change.to_csv("%s.fc.csv" % debug)

    # Start meta-analysis
    logger.info('Meta-Analyzing %s', analysis.analysis_name)
    with log_durations(logger.debug, 'Meta analysis for %s' % analysis.analysis_name):
        # logger.info('Meta analysis of real data for %s' % analysis.analysis_name)
        with log_durations(logger.debug,
                           'meta analysis of real data for %s' % analysis.analysis_name):
            balanced = get_full_meta(fold_change.query("""perm == 0"""), debug=debug)
            debug and balanced.to_csv("%s.meta.csv" % debug)

        # logger.info('Meta-Analyzing of permutations for %s', analysis.analysis_name)
        with log_durations(logger.debug,
                           'meta analysis of permutations for %s' % analysis.analysis_name):
            permutations = pd.DataFrame()
            fold_change = fold_change.reset_index().sort('perm').set_index('perm')
            for i in range(nperm):
                perm = i + 1
                # logger.info('Meta analysis of permutation %s for %s' % (perm, analysis.analysis_name))
                with log_durations(logger.debug,
                                   'meta analysis of permutation %s / %s for %s'
                                   % (perm, nperm, analysis.analysis_name)):
                    # balanced_perm = get_full_meta(fold_change.query("""perm == %s""" % perm), debug=debug)
                    balanced_perm = get_full_meta(fold_change.ix[perm], debug=debug)
                    permutation = balanced_perm[['random_TE', 'fixed_TE']]
                    permutation['perm'] = perm
                    permutations = pd.concat([permutations, permutation])

            balanced_permutations = get_balanced_permutations(balanced, permutations)

    logger.info('DONE %s analysis', analysis.analysis_name)
    return df, fold_change, balanced_permutations, permutations
def perform_analysis(analysis, debug=False, impute=False, nperm=0, mygene_filter=None):
    """
    Returns a tuple of sample_df, fold_change, balanced_permutations, permutations
    """
    logger.info("Started %s analysis", analysis.analysis_name)
    # from multiprocessing import Pool
    # pool = Pool(processes=4)

    with log_durations(logger.debug, "Loading dataframe for %s" % analysis.analysis_name):
        df = get_analysis_df(analysis.case_query, analysis.control_query, analysis.modifier_query)
    debug and df.to_csv("%s.analysis_df.csv" % analysis.analysis_name)
    logger.info("Matching sources: %d" % df.groupby(["series_id", "platform_id"]).ngroups)

    # Remove single-class sources
    query = df.groupby(["series_id", "platform_id"]).sample_class.agg(lambda x: set(x)).map(lambda x: x >= {0, 1})
    df = filter_sources(df, query, "as single-class")

    # Check for minimum number of samples
    if not df.empty and analysis.min_samples:
        counts = df.groupby(["series_id", "platform_id"]).sample_class.value_counts().unstack()
        query = (counts[0] >= analysis.min_samples) & (counts[1] >= analysis.min_samples)
        df = filter_sources(df, query, "by min samples")

    # Check number of sources
    sources = df.groupby(["series_id", "platform_id"]).ngroups
    if sources <= 1:
        logger.error("FAIL Can't perform meta-analysis on %s" % ("single source" if sources else "no data"))
        return df, None, None, None

    # Calculate stats
    analysis.series_count = len(df.series_id.unique())
    analysis.platform_count = len(df.platform_id.unique())
    analysis.sample_count = len(df.sample_id.unique())
    analysis.series_ids = df.series_id.unique().tolist()
    analysis.platform_ids = df.platform_id.unique().tolist()
    analysis.sample_ids = df.sample_id.unique().tolist()
    # analysis.save(update_fields=['series_count', 'platform_count', 'sample_count',
    #                              'series_ids', 'platform_ids', 'sample_ids'])
    logger.info(
        "Stats: %d sources, %d series, %d platforms, %d samples"
        % (sources, analysis.series_count, analysis.platform_count, analysis.sample_count)
    )

    # Load GSE data, make and concat all fold change analyses results.
    # NOTE: we are doing load_gse() lazily here to avoid loading all matrices at once.
    logger.info("Loading data and calculating fold change for %s", analysis.analysis_name)
    with log_durations(logger.debug, "Load/fold for %s" % analysis.analysis_name):
        gses = (load_gse(df, series_id, impute) for series_id in sorted(df.series_id.unique()))
        debugs = [debug] * df.series_id.nunique()
        nperms = [nperm] * df.series_id.nunique()
        mygene_filters = [mygene_filter] * df.series_id.nunique()
        # start a pool with 4 processes
        fold_change = pd.concat(imap(get_gene_fold_change, gses, debugs, nperms, mygene_filters))
        # fold_change = pd.concat(pool.imap(multi_run_wrapper, zip(gses, debugs, nperms)))
        debug and fold_change.to_csv("%s.fc.csv" % debug)

    # Start meta-analysis
    logger.info("Meta-Analyzing %s", analysis.analysis_name)
    with log_durations(logger.debug, "Meta analysis for %s" % analysis.analysis_name):
        # logger.info('Meta analysis of real data for %s' % analysis.analysis_name)
        with log_durations(logger.debug, "meta analysis of real data for %s" % analysis.analysis_name):
            balanced = get_full_meta(fold_change.query("""perm == 0"""), debug=debug)
            if balanced is None:
                logger.error("FAIL Got empty meta-analysis")
                return df, fold_change, None, None
            debug and balanced.to_csv("%s.meta.csv" % debug)

        # logger.info('Meta-Analyzing of permutations for %s', analysis.analysis_name)
        with log_durations(logger.debug, "meta analysis of permutations for %s" % analysis.analysis_name):
            permutations = pd.DataFrame()
            fold_change = fold_change.reset_index().sort("perm").set_index("perm")
            for i in range(nperm):
                perm = i + 1
                # logger.info('Meta analysis of permutation %s for %s' % (perm, analysis.analysis_name))
                with log_durations(
                    logger.debug,
                    "meta analysis of permutation %s / %s for %s" % (perm, nperm, analysis.analysis_name)
                ):
                    # balanced_perm = get_full_meta(fold_change.query("""perm == %s""" % perm), debug=debug)
                    balanced_perm = get_full_meta(fold_change.ix[perm], debug=debug)
                    permutation = balanced_perm[["random_TE", "fixed_TE"]]
                    permutation["perm"] = perm
                    permutations = pd.concat([permutations, permutation])

            balanced_permutations = get_balanced_permutations(analysis, balanced, permutations)

    logger.info("DONE %s analysis", analysis.analysis_name)
    return df, fold_change, balanced_permutations, permutations