def analyze(inp_fname, out_fname, plt_fname=None, k=3, thresh_init=1e-2,
            thresh_conv=1e-4, scaling="identity"):
    """Run a k-means clustering analysis on cleaned data

    The cleaned data (output from the `clean` function) is read with `load`,
    passed through the selected scaling function, clustered, and written to
    `out_fname` as comma-separated text. A plot is additionally produced by
    `plot_kmeans` when `plt_fname` is given.

    :param inp_fname: Input filename of cleaned data
    :param out_fname: Output filename for k-means data
    :param plt_fname: (optional) Output filename for k-means plot
    :param k: (optional) Number of k-means
    :param thresh_init: (optional) Threshold for initiation
    :param thresh_conv: (optional) Threshold for convergence
    :param scaling: (optional) "identity", "zscore", or "minmax" scaling
    :returns: None
    """
    scalers = {
        "identity": identity,
        "zscore": zscore,
        "minmax": minmax,
    }
    data = load(inp_fname)
    # The loaded dict keeps the value field names under the None key; the
    # truthiness filter leaves only the (ID, NAME) keys.
    gene_keys = tuple(filt(bool)(dict.keys(data)))
    rpkm = pipe(scalers[scaling])(part(getitems, data)(gene_keys))

    # Initiation: keep drawing k random rows as starting means until two
    # successive draws give a relative SSE difference within thresh_init.
    # NOTE(review): if relerr(1, 0) is not above thresh_init this loop never
    # runs and `labels`/`means` stay unbound -- confirm relerr's definition.
    sse, previous = 1, 0
    while relerr(sse, previous) > thresh_init:
        previous = sse
        sse, labels, means = kstep(random.sample(rpkm, k), rpkm)

    # Convergence: alternate assignment (kstep) and mean update (mstep).
    sse, previous = 1, 0
    while relerr(sse, previous) > thresh_conv:
        previous = sse
        sse, labels, _ = kstep(means, rpkm)
        means = mstep(labels, rpkm)

    field_names = data[None]
    if plt_fname is not None:
        plot_kmeans(plt_fname, means, labels, rpkm, field_names, scaling)

    def strongest(column):
        # position of the largest entry in one column of the means
        return column.index(max(column))

    organs = pipe(strongest)(transpose(means))
    order = sorted(range(len(organs)), key=organs.__getitem__)
    by_group = collections.defaultdict(list)
    for pos in order:
        by_group[organs[pos]].append(field_names[pos])

    def describe(names):
        return repr("|".join(sorted(names)))

    with open(out_fname, "w") as f:
        f.write("ID,NAME,GROUP,ASSIGNMENT,URL\n")
        # rows are ordered by group first, then by (ID, NAME)
        pairs = sorted(zip(gene_keys, labels),
                       key=lambda pair: (pair[1], pair[0]))
        for (id_, name), group in pairs:
            f.write(
                f"{id_},{name},{group},{describe(by_group[group])},{TEMPLATE_NCBI_GENE.format(id_)}\n")
def mstep(labels, values):
    """Generate new k-means given labeled RPKM values

    The values are grouped by label and each group is reduced with
    `transpose` followed by `mean` over every transposed row.

    The groups are emitted in sorted label order. Iterating the grouping
    dict directly would yield them in the order labels first occur in
    `labels`, so position j of the result would not necessarily belong to
    label j, even though callers index the result by label.

    :param labels: K-mean assignments (must be mutually sortable)
    :param values: tuples of RPKM values
    :returns: New k-means, one entry per distinct label, in sorted label
        order
    """
    groups = collections.defaultdict(list)
    for label, value in zip(labels, values):
        groups[label].append(value)
    # NOTE(review): a label with no assigned values yields no entry, so the
    # result can be shorter than the number of clusters.
    ordered = [groups[label] for label in sorted(groups)]
    return pipe(nest(transpose, pipe(mean)))(ordered)
def sorted_fields(data):
    """Sort the fieldnames by mean RPKM

    The result starts with VALID_KEYS; the remaining field names of the
    first row follow in descending order of their `mean` over all rows.
    Fields with equal means keep their alphabetical order.

    :param data: Pruned data, tuple of dicts
    :returns: tuple of sorted fieldnames
    """
    header = tuple(dict.keys(get(0)(data)))
    value_fields = tuple(sorted(part(sub, set(VALID_KEYS))(set(header))))

    def row_values(row):
        # values of one row, in `value_fields` order
        return pipe(get(row, iskey=False))(value_fields)

    columns = tuple(zip(*pipe(row_values)(data)))
    positions = range(len(columns))
    column_mean = get(pipe(mean)(columns), iskey=False)
    ranked = tuple(sorted(positions, key=column_mean, reverse=True))
    return VALID_KEYS + pipe(get(value_fields, iskey=False))(ranked)
def minmax(itr):
    """Rescale values by subtracting the minimum and dividing by the range

    :param itr: sized collection of numbers
    :returns: the rescaled values, or None when `itr` has fewer than two
        elements
    """
    if len(itr) <= 1:
        return None
    lo = min(itr)
    hi = max(itr)
    shift = part(sub, lo)
    # NOTE(review): hi == lo gives a zero range; the division is not guarded.
    squeeze = part(truediv, hi - lo)

    def rescale(x):
        return squeeze(shift(x))

    return pipe(rescale)(itr)
def convert(dct):
    """Convert a dict of str values to numeric where possible

    Every value is passed through `safenum`; the keys are kept unchanged.

    :param dct: dict with strs from cleaned data
    :returns: dict with numeric data
    """
    converted = pipe(safenum)(dct.values())
    return {key: value for key, value in zip(dct, converted)}
def clean(inp_fname, out_fname):
    """Given raw data (output from `retrieve` function), remove the
    unnecessary information and sort the fieldnames by mean RPKM

    Each row of the input CSV is passed through `prune`; the pruned rows are
    then handed to `writecsv` together with the field order computed by
    `sorted_fields`.

    :param inp_fname: Input filename of raw data
    :param out_fname: Output filename for cleaned data
    :returns: whatever `writecsv` returns (presumably the tuple of dicts
        that was written -- confirm against `writecsv`)
    """
    # newline="" lets the csv module do its own newline handling, as its
    # documentation requires for files passed to a reader.
    with open(inp_fname, newline="") as f:
        rows = pipe(prune)(csv.DictReader(f))
        return writecsv(rows, out_fname, sorted_fields(rows))
def load(fname):
    """Load from cleaned data

    Each CSV row is passed through `convert` and the converted rows are
    reorganized by `todatadict`.

    :param fname: Filename of cleaned data
    :returns: dict containing reorganized data
    """
    # newline="" lets the csv module do its own newline handling, as its
    # documentation requires for files passed to a reader.
    with open(fname, newline="") as f:
        rows = pipe(convert)(csv.DictReader(f))
        return todatadict(rows)
def todatadict(data):
    """Reorganize tuple of dicts into a single dict

    Keys are (ID, NAME) pairs and values hold the row's entries for every
    field that is not listed in VALID_KEYS. Those field names are taken from
    the first row and are themselves stored under the None key.

    :param data: tuple of dicts from loaded cleaned data
    :returns: dict containing reorganized data
    """
    header = tuple(dict.keys(get(0)(data)))

    def is_value_field(name):
        return name not in VALID_KEYS

    fields = tuple(filt(is_value_field)(header))

    result = {}
    for row in data:
        result[(row["ID"], row["NAME"])] = pipe(get(row, iskey=False))(fields)

    # remember the keys associated with the RPKM values
    result[None] = fields
    return result
def retrieve(inp_fname, out_fname, include=bool):
    """Given a search export from NCBI Gene, retrieve RNA-Seq data on those
    GeneIDs, and then write out the data to an output file

    The export is read as tab-delimited text. Rows are filtered with
    `include`, their "GeneID" column is converted to int and sorted, and one
    URL per id is built from TEMPLATE_NCBI_GENE. The URLs are then handed to
    `procfull(out_fname)`.

    :param inp_fname: Input filename of NCBI Gene search export
    :param out_fname: Output filename for writing raw data
    :param include: Function to filter out input rows; rows for which it
        returns a falsy value are skipped
    :returns: whatever `procfull(out_fname)(urls)` returns (presumably a
        tuple of dicts -- confirm against `procfull`)
    """
    # newline="" lets the csv module do its own newline handling, as its
    # documentation requires for files passed to a reader.
    with open(inp_fname, newline="") as f:
        ids = nest(
            part(csv.DictReader, delimiter="\t"),
            filt(include),
            apply(nest(get("GeneID"), int)),
            sorted,
            tuple,
        )(f)
    urls = pipe(TEMPLATE_NCBI_GENE.format)(ids)
    # NOTE(review): this reports one chunk too many when len(urls) is an
    # exact multiple of CHUNK_SIZE, assuming `procfull` splits the URLs into
    # ceil(len / CHUNK_SIZE) chunks -- confirm before changing the message.
    n = 1 + len(urls) // CHUNK_SIZE
    print(f"A total of {n} chunks will be retrieved.")
    return procfull(out_fname)(urls)
def zscore(itr):
    """Standardize values: subtract `mean(itr)`, then divide by `sd(itr)`

    :param itr: collection of numbers
    :returns: the standardized values
    """
    center = part(sub, mean(itr))
    rescale = part(truediv, sd(itr))

    def standardize(x):
        return rescale(center(x))

    return pipe(standardize)(itr)
def var(itr):
    """Variance with an n - 1 denominator

    :param itr: sized collection of numbers
    :returns: sum of squared deviations from `mean(itr)` divided by
        len(itr) - 1, or NaN when `itr` has fewer than two elements
    """
    n = len(itr)
    if n <= 1:
        return float("nan")
    center = part(sub, mean(itr))
    square = part(pow, 2)

    def squared_deviation(x):
        return square(center(x))

    return sum(pipe(squared_deviation)(itr)) / (n - 1)
def harmonic_mean(itr):
    """Harmonic mean: `inv` of the `mean` of the `inv`-ed values

    :param itr: collection of numbers
    :returns: the harmonic mean of `itr`
    """
    reciprocals = pipe(inv)(itr)
    return inv(mean(reciprocals))