def transform(self, data):
    """Count the members of every region of a 2- or 3-way Venn diagram.

    The rows of *data* are grouped over all index levels reported by
    ``Utils.getDataFrameLevels`` and the values of the single column in
    each group are collected into a set.  Groups whose key contains
    ``"background"`` are skipped unless ``self.background`` is true.

    Each region is labelled with a bit string with one position per
    set, in grouping order: ``"10"`` counts the items found only in the
    first of two sets, ``"110"`` those shared by the first two of three
    sets but absent from the third, and so on.  A final ``"labels"``
    entry carries the ``path2str`` form of every group key.

    Returns:
        Whatever ``DataTree.listAsDataFrame`` builds from the list of
        ``(region, count)`` pairs plus the ``"labels"`` entry.

    Raises:
        ValueError: if *data* does not have exactly one column, or if
            the number of collected sets is neither 2 nor 3.
    """
    # The input has to be melted, i.e. carry exactly one value column.
    if len(data.columns) != 1:
        raise ValueError(
            'transformer requires dataframe with '
            'a single column, got %s' % data.columns)
    column = data.columns[0]

    # One set of values per combination of index levels.
    nlevels = Utils.getDataFrameLevels(data)
    genesets = {}
    for key, group in data.groupby(level=list(range(nlevels))):
        if "background" not in key or self.background:
            genesets[key] = set(group[column])

    names = list(genesets.keys())
    if len(names) == 2:
        first, second = (genesets[name] for name in names)
        values = [
            ("10", len(first - second)),
            ("01", len(second - first)),
            ("11", len(first & second)),
            ("labels", [path2str(name) for name in names]),
        ]
    elif len(names) == 3:
        first, second, third = (genesets[name] for name in names)
        values = [
            ("100", len(first - second - third)),
            ("010", len(second - first - third)),
            ("001", len(third - first - second)),
            ("110", len((first & second) - third)),
            ("101", len((first & third) - second)),
            ("011", len((second & third) - first)),
            ("111", len(first & second & third)),
            ("labels", [path2str(name) for name in names]),
        ]
    else:
        raise ValueError(
            "Can currently only cope with 2 or 3 way intersections")

    return DataTree.listAsDataFrame(values)
def transform(self, data):
    """Test every pair of lists for overlap with a hypergeometric test.

    The rows of *data* are grouped over all index levels reported by
    ``Utils.getDataFrameLevels``; the values of the single column in
    each group form one list, named by ``path2str`` of the group key.
    A list whose name contains ``"background"`` supplies the universe
    of items (if several match, the last one wins).

    For every pair of lists the fold enrichment of the overlap and the
    hypergeometric survival function are computed, with the size of
    the background as population size.

    Returns:
        Whatever ``DataTree.listAsDataFrame(values, values_are_rows=True)``
        builds from the columns ``ListA``, ``ListB``, ``Enrichment``
        and ``P-value`` (one entry per pair).

    Raises:
        ValueError: if *data* does not have exactly one column, if
            fewer than three lists are present, if no list is named
            background, or if a non-background list contains items
            that are absent from the background.
    """
    # The input has to be melted, i.e. carry exactly one value column.
    if len(data.columns) != 1:
        raise ValueError(
            'transformer requires dataframe with '
            'a single column, got %s' % data.columns)
    column = data.columns[0]

    # One set of values per combination of index levels.
    nlevels = Utils.getDataFrameLevels(data)
    genesets = {}
    for key, group in data.groupby(level=list(range(nlevels))):
        genesets[path2str(key)] = set(group[column])

    # Materialise the names once so they can be counted, joined and
    # combined without depending on dict-view semantics.
    keys = list(genesets)

    background = None
    foreground = []
    for key in keys:
        if "background" in key:
            background = genesets[key]
        else:
            foreground.append(key)

    if len(keys) < 3 or background is None:
        raise ValueError(
            "Expected at least 3 lists, with one called background, "
            "instead got %i lists called %s" %
            (len(keys), ", ".join(keys)))

    # Every item of a foreground list must be part of the background,
    # otherwise the population size used below would be wrong.
    missing = {
        name: [str(item) for item in genesets[name]
               if item not in background]
        for name in foreground}
    if any(missing.values()):
        missing_items = "\n\t".join(
            "%s:\t%s" % (name, ",".join(items))
            for name, items in missing.items())
        raise ValueError(
            "Found items in lists not in background. "
            "Missing items:\n\t %s" % missing_items)

    M = len(background)

    # At least three lists are guaranteed at this point, so the result
    # is always the pairwise table.
    # NOTE(review): the pairs are drawn from all lists, including the
    # background itself - confirm that this is intended.
    list_a = []
    list_b = []
    enrichments = []
    pvals = []
    for a, b in itertools.combinations(keys, 2):
        N = len(genesets[a])
        n = len(genesets[b])
        x = len(genesets[a] & genesets[b])
        # NOTE(review): sf(x, ...) is P(X > x); the probability of an
        # overlap at least as large as observed would be sf(x - 1, ...).
        p = scipy.stats.hypergeom.sf(x, M, n, N)
        # Observed overlap fraction relative to the fraction expected
        # from the background.
        fc = ((x + 0.0) / N) / ((n + 0.0) / M)
        list_a.append(a)
        list_b.append(b)
        pvals.append(p)
        enrichments.append(fc)

    values = [("ListA", list_a),
              ("ListB", list_b),
              ("Enrichment", enrichments),
              ("P-value", pvals)]

    return DataTree.listAsDataFrame(values, values_are_rows=True)