def weight_log_likelihood(target, source, env):
    """
    Multiply the matrix stored in source[0] by the matrix stored in source[1].

    target, source -- lists of build nodes; file paths are taken from .rstr().
    source[-1]     -- node whose .read() is called (its result is not used).
    env            -- build environment (unused).

    NOTE: the product is computed with the ``*`` operator and then dropped.
    The pickle dump that used to write it to target[0] is disabled, so this
    builder currently produces no output file.
    """
    source[-1].read()
    values, _value_labels, _value_features = unpack_numpy(source[0].rstr())
    weights, _weight_labels, _weight_features = unpack_numpy(source[1].rstr())
    weighted = values * weights
    # weighted is intentionally left unused: persisting it is disabled.
    return None
def project_svd(target, source, env):
    """
    Project a data matrix through a projection matrix and save the result.

    source[0]  -- data file; its labels are kept for the output.
    source[1]  -- projection file; its features become the output features.
    source[-1] -- node whose .read() is called (its result is not used).
    target[0]  -- output file written with pack_numpy.

    Both inputs are loaded with dense=False and combined with ``*``.
    """
    source[-1].read()
    observations, row_labels, _ = unpack_numpy(source[0].rstr(), dense=False)
    projection, _, projected_features = unpack_numpy(source[1].rstr(), dense=False)
    projected = observations * projection
    pack_numpy(
        target[0].rstr(),
        data=projected,
        labels=row_labels,
        features=projected_features,
    )
    return None
def feature_union(target, source, env):
    """
    Keep only the columns of source[0] whose stem occurs in source[1].

    Each feature name of source[0] is expected to look like
    ``<UPPERCASE>-<stem>_=_<rest>`` or ``<UPPERCASE>-<stem>_+_<rest>``; the
    stem is the first regex group.  Each feature name of source[1] is
    reduced to the text after its last ``_=_``.  A column of source[0] is
    kept when its stem equals one of those reduced names.

    The kept columns are written to target[0] sorted by (feature name,
    original column index), together with the original row labels.

    Raises AttributeError if a feature name of source[0] does not match the
    expected pattern (same as before).
    """
    source[-1].read()
    dmat, dlabels, dfeatures = unpack_numpy(source[0].rstr())
    _fmat, _flabels, ffeatures = unpack_numpy(source[1].rstr())

    # Raw string: identical pattern, without relying on unknown string escapes.
    stem_rx = re.compile(r"^[A-Z]+-(.*)_(\=|\+)_.*$")
    stems = [stem_rx.match(name).group(1) for name in dfeatures]

    # A set makes each membership test O(1) instead of scanning a list.
    wanted = set(name.split("_=_")[-1] for name in ffeatures)

    selected = sorted(
        (dfeatures[i], i) for i, stem in enumerate(stems) if stem in wanted
    )
    columns = [i for _name, i in selected]
    names = [name for name, _i in selected]
    pack_numpy(target[0].rstr(), data=dmat[:, columns], labels=dlabels, features=names)
    return None
def filter_same_features(target, source, env):
    """
    Drop every column of source[0] whose feature is absent from source[1].

    source[0]  -- matrix to filter (converted to CSC for column access).
    source[1]  -- reference file; only its feature list is used.
    source[-1] -- node whose .read() is called (its result is not used).
    target[0]  -- output: the surviving columns, in their original order,
                  as a COO matrix with the original row labels.
    """
    source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    _refmat, _reflabels, reffeatures = unpack_numpy(source[1].rstr())
    columns = mat.tocsc()

    keep = []
    for index, feature in enumerate(features):
        if feature in reffeatures:
            keep.append(index)
    logging.info("keeping %d/%d features", len(keep), len(features))

    # Copy the surviving columns one at a time into a fresh LIL matrix.
    selected = [columns.getcol(index) for index in keep]
    filtered = sparse.lil_matrix((len(labels), len(keep)))
    for position, column in enumerate(selected):
        filtered[:, position] = column

    kept_features = [features[index] for index in keep]
    pack_numpy(target[0].rstr(), data=filtered.tocoo(), labels=labels, features=kept_features)
    return None
def normalize(target, source, env):
    """
    Normalize each row of source[0] within every feature distribution.

    Features are grouped by the part of their "_NAME" before the first
    ``_=_``.  For every row, each value is divided by the row's total over
    the group its column belongs to.  Division warnings are silenced, so a
    group whose total is zero yields NaN/inf entries rather than an error.

    source[-1] -- node whose .read() is called (its result is not used).
    target[0]  -- output: the normalized matrix (COO) with the original
                  labels and features.

    The numpy error settings in force on entry are restored on exit, also
    when an exception is raised.
    """
    # seterr returns the previous settings so they can be put back later.
    old = numpy.seterr(divide='ignore', invalid='ignore')
    try:
        source[-1].read()
        mat, labels, features = unpack_numpy(source[0].rstr())

        # distribution name -> list of column indices belonging to it
        distros = {}
        for column, feature in enumerate(features):
            name = feature.get("_NAME").split("_=_")[0]
            distros.setdefault(name, []).append(column)
        logging.info("found %d distribution(s)", len(distros))
        if len(distros) == 0:
            distros = {"ALL": list(range(mat.shape[1]))}

        mat = mat.tolil()
        # Start from an empty sparse matrix instead of a dense zero array.
        newmat = sparse.lil_matrix(mat.shape)
        totals = numpy.empty(shape=(1, len(features)))
        # Densify one row at a time rather than all rows up front.
        for ri in range(mat.shape[0]):
            row = mat.getrow(ri).todense()
            logging.info("processing object %d/%d", ri, len(labels))
            for indices in distros.values():
                totals[0, indices] = row[:, indices].sum()
            newmat[ri, :] = row / totals
        pack_numpy(target[0].rstr(), data=newmat.tocoo(), labels=labels, features=features)
    finally:
        numpy.seterr(**old)
    return None
def add_definitions(target, source, env):
    """
    Write an XML report that pairs matrix rows with dictionary entries.

    source[0] -- matrix file; its labels are treated as words.
    source[1] -- XML dictionary.  Every element whose tag ends in "div" and
                 whose "type" attribute is "entry" is indexed by its "n"
                 attribute.
    target[0] -- output XML, opened with meta_open.

    For each word that has a dictionary entry, a <w> element is written
    holding the lemma (the "lemma" attribute of the entry's first element
    whose tag ends in "w"), the feature headers, the row values, and the
    text of every element whose tag ends in "item".

    The output handle is closed even if writing fails.
    NOTE(review): text is written without XML escaping.
    """
    mat, words, features = unpack_numpy(source[0].rstr())
    fd = meta_open(target[0].rstr(), "w")
    try:
        fd.write("<?xml-stylesheet href=\"presentation.xsl\" type=\"text/xsl\" ?><xml>")
        defs = {}
        for elem in et.parse(source[1].rstr()).getiterator():
            if elem.tag.endswith("div") and elem.attrib.get("type", "") == "entry":
                defs[elem.attrib["n"]] = elem
        logging.info("loaded %d dictionary entries", len(defs))
        for val, word in zip(mat, words):
            if word not in defs:
                continue
            entry = defs[word]
            lemma = [x for x in entry.getiterator() if x.tag.endswith("w")][0].attrib["lemma"]
            fd.write("<w><lemma>%s</lemma><vals>" % lemma.encode("utf-8"))
            # First block lists the feature headers, second the row values.
            for h, v in zip(features, val):
                fd.write("<val>%s</val>\n" % h)
            fd.write("</vals><vals>")
            for h, v in zip(features, val):
                fd.write("<val>%f</val>\n" % v)
            fd.write("</vals><defs>")
            for d in [x for x in entry.getiterator() if x.tag.endswith("item")]:
                fd.write("<def>%s</def>\n" % d.text.encode("utf-8"))
            fd.write("</defs></w>\n")
        fd.write("</xml>")
    finally:
        fd.close()
    return None
def plot_scatter(target, source, env):
    """
    Draw an annotated scatter plot from the first two columns of source[0].

    Column 0 gives x, column 1 gives y; each point is annotated with its
    label in title case.  The first two feature names become the axis
    labels (trailing whitespace and commas are stripped from the x label).

    env["CLUSTERS"] (optional) -- node of a text file with lines of the form
        ``<int> <cluster> (<label>)`` (the label may be single-quoted).
        The cluster number picks the marker shape; labels without an entry
        use marker 0.  Cluster numbers beyond the marker list wrap around.

    source[-1] -- node whose .read() is called (its result is not used).
    target[0]  -- image file written with pyplot.savefig.

    A failure while drawing a point is logged and re-raised.  The previous
    code printed the x value and called sys.exit(), which ended the process
    with a success status.
    """
    source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    X = mat[:, 0]
    Y = mat[:, 1]
    shapes = ['+', 'o', 's', '^', 'v', '<', '>', 'd', 'p', 'h', '8']

    clustering = {}
    if env.get("CLUSTERS", None):
        fd = meta_open(env["CLUSTERS"].rstr())
        try:
            cluster_text = fd.read()
        finally:
            fd.close()
        for m in re.finditer(r"^(\d+) (\d+) \('?(.*?)'?\)$", cluster_text, re.M):
            clustering[m.group(3)] = int(m.group(2))

    for x, y, label in zip(X, Y, labels):
        marker = shapes[clustering.get(label, 0) % len(shapes)]
        try:
            pyplot.scatter([x], [y], s=10, c="black", marker=marker)
        except Exception:
            logging.error("could not plot point %s for label %s", x, label)
            raise
        pyplot.annotate(label.title(), (x, y), xytext=(15, 0), textcoords="offset points",
                        fontsize=10, arrowprops=dict(arrowstyle="-"))

    pyplot.xticks([], [])
    pyplot.yticks([], [])
    pyplot.xlabel(features[0].rstrip().rstrip(","))
    pyplot.ylabel(features[1])
    pyplot.savefig(target[0].rstr(), dpi=100, bbox_inches="tight")
    pyplot.cla()
    return None
def plot_spread(target, source, env):
    """
    Plot, per subject, a 50-bin histogram of its off-diagonal matrix values.

    args["SUBJECTS"] (args = source[-1].read()) names the subjects; they are
    paired positionally with the source files.  For every file, all values
    except those on the diagonal are collected.  The global minimum and
    maximum over all subjects define 50 equal-width bins.  A value is
    counted only when it lies strictly inside a bin, so values exactly on a
    bin edge (including the global min and max) are not counted.

    Subjects are split into four consecutive groups drawn with the line
    styles "-", "--", "-." and ":".  The figure is saved to target[0].

    NOTE(review): stacking the per-subject lists with numpy.asarray assumes
    every subject yields the same number of values -- confirm.
    """
    args = source[-1].read()
    bins = 50  # an int, so it can be used directly as an array dimension
    raw_data = []
    labels = []
    for name, f in zip(args["SUBJECTS"], source):
        mat, _l, _features = unpack_numpy(f.rstr())
        off_diagonal = []
        for r, row in enumerate(mat):
            off_diagonal.extend(col for c, col in enumerate(row) if c != r)
        raw_data.append(off_diagonal)
        labels.append(name)
    raw_data = numpy.asarray(raw_data)

    rmin, rmax = raw_data.min(), raw_data.max()
    inc = (rmax - rmin) / float(bins)
    data = numpy.empty(shape=(len(labels), bins))
    for r in range(len(labels)):
        data[r, :] = [
            len([v for v in raw_data[r] if rmin + c * inc < v < rmin + (c + 1) * inc])
            for c in range(bins)
        ]

    named = list(zip(args["SUBJECTS"], data))
    for i, ls in enumerate(["-", "--", "-.", ":"]):
        # Floor division keeps the slice bounds integral on every Python version.
        for name, datum in named[i * len(labels) // 4:(i + 1) * len(labels) // 4]:
            pyplot.plot(datum, label=name, ls=ls)

    # Five ticks spanning the bin range; float positions keep the labels
    # aligned with the bins (integer 50 / 4 put them at multiples of 12).
    tick_inc = (rmax - rmin) / 4.0
    pyplot.xticks([x * bins / 4.0 for x in range(5)],
                  ["%.5f" % (rmin + tick_inc * x) for x in range(5)])
    pyplot.xlabel("Jensen-Shannon Divergence")
    pyplot.ylabel("Number of intra-subdomain random sample pairs")
    pyplot.legend(prop={"size" : 6}, ncol=3)
    pyplot.savefig(target[0].rstr(), dpi=100, bbox_inches="tight")
    pyplot.cla()
    return None
def split_numpy(target, source, env):
    """
    Split the columns of source[0] into one output file per name pattern.

    args["PATTERNS"] (args = source[-1].read()) is paired positionally with
    the target files.  For pattern P, the matching target receives the
    columns whose feature "_NAME" starts with ``P_`` or ``P-``.  A feature
    without a "_NAME" key is compared using the literal string "_NAME".
    Row labels are copied unchanged to every output.
    """
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    dense = None
    for pat, fname in zip(args["PATTERNS"], target):
        if dense is None:
            # Densify once, on first use, instead of once per pattern.
            dense = mat.todense()
        prefixes = ("%s_" % pat, "%s-" % pat)
        indices = [i for i, x in enumerate(features)
                   if x.get("_NAME", "_NAME").startswith(prefixes)]
        pack_numpy(fname.rstr(), data=sparse.coo_matrix(dense[:, indices]),
                   labels=labels, features=[features[i] for i in indices])
    return None
def filter_features(target, source, env):
    """
    Write one pickled sub-matrix per feature-name prefix.

    args["FILTERS"] (args = source[-1].read()) is paired positionally with
    the target files.  For each prefix, the columns whose feature name
    starts with that prefix are pickled to the matching target as the
    tuple ``(sub-matrix, labels, kept feature names)``.

    Feature names are used as strings here (``str.startswith``).  Each
    output handle is closed once its dump has finished.
    """
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    for targ, filt in zip(target, args["FILTERS"]):
        indices = [i for i, x in enumerate(features) if x.startswith(filt)]
        newfeatures = [features[i] for i in indices]
        fd = meta_open(targ.rstr(), "w")
        try:
            pickle.dump((mat[:, indices], labels, newfeatures), fd)
        finally:
            fd.close()
    return None
def remove_zero_features(target, source, env):
    """
    Remove every column of source[0] whose sum is not strictly positive.

    source[-1] -- node whose .read() is called (its result is not used).
    target[0]  -- output: the remaining columns as a plain array, with the
                  original labels and the matching subset of features.
    """
    source[-1].read()
    loaded, labels, features = unpack_numpy(source[0].rstr(), dense=True)
    dense = numpy.asarray(loaded)

    kept = []
    for column, total in enumerate(dense.sum(0)):
        if total > 0:
            kept.append(column)

    pack_numpy(
        target[0].rstr(),
        data=dense[:, kept],
        labels=labels,
        features=[features[column] for column in kept],
    )
    return None
def compute_svd(target, source, env):
    """
    Decompose source[0] with svd() and save both factor matrices.

    args["DIMENSIONS"] (args = source[-1].read(), default 300) is passed to
    svd() as its second argument.

    target[0] -- first factor, labelled by the input's row labels.
    target[1] -- transpose of the third factor, labelled by the input's
                 features.
    Output features are {"SVD": "1"}, {"SVD": "2"}, ... per dimension.
    """
    args = source[-1].read()
    mat, docs, words = unpack_numpy(source[0].rstr(), dense=True)
    rank = args.get("DIMENSIONS", 300)
    left, _singular, right = svd(mat, rank)

    def dimension_names(count):
        # One-based component names stored as strings.
        return [{"SVD" : str(number)} for number in range(1, count + 1)]

    pack_numpy(target[0].rstr(), data=left, labels=docs, features=dimension_names(left.shape[1]))
    pack_numpy(target[1].rstr(), data=right.T, labels=words, features=dimension_names(right.shape[0]))
    return None
def numpy_to_arff(target, source, env):
    """
    Convert the matrix in source[0] to an ARFF file at target[0].

    Each row becomes one datum that maps every feature "_NAME" to the row's
    value for it.  The datum is then updated with the row's label dict, so
    label keys override feature keys of the same name.

    The output handle is closed after saving, also if saving fails.
    """
    mat, labels, features = unpack_numpy(source[0].rstr())
    names = [x.get("_NAME") for x in features]
    arff = weka.Arff()
    for label, values in zip(labels, mat):
        datum = dict(zip(names, values))
        datum.update(label)
        arff.add_datum(datum)
    fd = meta_open(target[0].rstr(), "w")
    try:
        arff.save(fd)
    finally:
        fd.close()
    return None
def compute_log_likelihoods(target, source, env):
    """
    Replace every column of source[0] by its ct_log_likelihood score.

    The matrix is densified and its row sums computed.  Each column is then
    passed to ct_log_likelihood together with those row sums, and the
    result becomes the corresponding output column.  Labels and features
    are copied unchanged to target[0].

    Division and invalid-value warnings are silenced while computing.  The
    numpy error settings in force on entry are restored on exit, also when
    an exception is raised.
    """
    source[-1].read()
    # seterr returns the previous settings so they can be put back later.
    old = numpy.seterr(divide='ignore', invalid='ignore')
    try:
        mat, labels, features = unpack_numpy(source[0].rstr(), dense=False)
        mat = numpy.asarray(mat.todense())
        totals = mat.sum(1)
        logging.info("created totals matrix of shape %s", totals.shape)
        newmat = numpy.empty(shape=mat.shape)
        for i in range(mat.shape[1]):
            newmat[:, i] = ct_log_likelihood(mat[:, i], totals)
        pack_numpy(target[0].rstr(), data=newmat, labels=labels, features=features)
    finally:
        numpy.seterr(**old)
    return None
def compute_pairwise(target, source, env):
    """
    Apply env["FUNCTION"] to the dense matrix in source[0] and save it.

    The result is written to target[0] with the row labels used for both
    axes, so FUNCTION is expected to return a labels-by-labels matrix.

    args["FILTER"] (args = source[-1].read(), default empty) lists label
    "_NAME" values to drop from the label list that is written out.
    NOTE(review): only the label list is filtered.  FUNCTION still receives
    the full matrix, so a non-empty FILTER leaves the labels shorter than
    the data -- confirm the intent.

    env["EQUAL"] (optional) maps keys to node lists.  The first element of
    each pickle is loaded as before, but the values are not used further.

    Division and invalid-value warnings are silenced while computing.  The
    numpy error settings in force on entry are restored on exit, also when
    an exception is raised.

    NOTE: a commented-out earlier implementation (per-distribution pairwise
    values with optional WEIGHTS averaging) was dropped from this body; it
    remains available in version control.
    """
    args = source[-1].read()
    if "EQUAL" in env:
        equal = {}
        for key, nodes in env["EQUAL"].items():
            fd = open(nodes[0].rstr())
            try:
                equal[key] = cPickle.load(fd)[0]
            finally:
                fd.close()
    # seterr returns the previous settings so they can be put back later.
    old = numpy.seterr(divide='ignore', invalid='ignore')
    try:
        mat, labels, features = unpack_numpy(source[0].rstr(), dense=True)
        excluded = args.get("FILTER", [])
        labels = [x for x in labels if x["_NAME"] not in excluded]
        data = env["FUNCTION"](mat)
        pack_numpy(target[0].rstr(), data=data, labels=labels, features=labels)
    finally:
        numpy.seterr(**old)
    return None
def plot_dendrogram(target, source, env):
    """
    Plot a hierarchical-clustering dendrogram of the rows of source[0].

    env["LABEL_FILTER"] / env["FEATURE_FILTER"] (optional) restrict the
    rows / columns to the listed labels / features.  At most the first 50
    selected rows are plotted.

    args (source[-1].read()):
        DIRECT  -- if true, the selected matrix is treated as a distance
                   matrix and condensed with squareform(); otherwise cosine
                   distances between rows are computed with pdist().
        LINKAGE -- linkage method, default "average".

    The figure is saved to target[0].

    Rows and columns are selected by their position in the original label
    and feature lists, and the dendrogram receives exactly the labels of
    the plotted rows.  (The previous code took positions from the *sorted*
    lists and always passed the full label list, so matrix rows and leaf
    labels only lined up when nothing was filtered or truncated.)
    """
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    roworder = [i for i, f in enumerate(labels)
                if "LABEL_FILTER" not in env or f in env["LABEL_FILTER"]][0:50]
    colorder = [i for i, f in enumerate(features)
                if "FEATURE_FILTER" not in env or f in env["FEATURE_FILTER"]]
    mat = mat[roworder][:, colorder]
    leaf_labels = [labels[i] for i in roworder]
    if args.get("DIRECT", False):
        Y = squareform(mat, checks=False)
    else:
        Y = pdist(mat, "cosine")
    Z = abs(linkage(Y, args.get("LINKAGE", 'average')))
    dendrogram(Z, labels=leaf_labels, orientation="right", color_threshold=-1, leaf_font_size="xx-small")
    pyplot.savefig(target[0].rstr(), dpi=100, bbox_inches="tight")
    pyplot.cla()
    pyplot.clf()
    return None
def filter_counts(target, source, env):
    """
    Select columns of source[0] by their total count.

    args (source[-1].read()):
        THRESHOLD -- keep the columns whose sum is >= this value, in their
                     original order.
        NUMBER    -- otherwise, keep this many columns with the largest
                     sums, ordered from largest to smallest (ties go to the
                     higher column index first).
    If neither key is present, all columns are kept.  (Previously this case
    raised NameError.)

    target[0] -- output: the selected columns (COO) with the original
                 labels and the matching features.
    """
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    mat = mat.tocsc()
    # Flatten the 1 x N column-sum matrix into a plain 1-D array of totals.
    totals = numpy.asarray(mat.sum(0)).ravel()
    if "THRESHOLD" in args:
        keep = [i for i, x in enumerate(totals) if x >= args["THRESHOLD"]]
    elif "NUMBER" in args:
        ranked = sorted([(x, i) for i, x in enumerate(totals)], reverse=True)
        keep = [i for _x, i in ranked[0:args["NUMBER"]]]
    else:
        keep = list(range(len(totals)))
    logging.info("keeping %d/%d features", len(keep), len(totals))
    newmat = mat[:, keep]
    newfeatures = [features[i] for i in keep]
    pack_numpy(target[0].rstr(), data=newmat.tocoo(), labels=labels, features=newfeatures)
    return None
def log_likelihood(target, source, env):
    """
    Score every cell of source[0] against the rest of the matrix.

    For each (row, column) a two-row contingency table is passed to
    ct_log_likelihood:
        (cell value,                  row total)
        (column total - cell value,   grand total - row total)

    NOTE: the score matrix is filled in and then dropped.  The pickle dump
    that used to write it to target[0] is disabled, so this builder
    currently produces no output file.
    """
    source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    scores = numpy.empty(mat.shape)
    row_totals = mat.sum(1)
    column_totals = mat.sum(0)
    grand_total = column_totals.sum()
    for c in range(len(features)):
        for r in range(len(labels)):
            observed = mat[r][c]
            table = [
                (observed, row_totals[r]),
                (column_totals[c] - observed, grand_total - row_totals[r]),
            ]
            scores[r][c] = ct_log_likelihood(table)
    return None
def plot_distros(target, source, env):
    """
    Plots a matrix of distributions (rows) over features (columns), five
    distributions per subplot.

    Rows whose sum is 10 or less are dropped and the rest are scaled to sum
    to one.  Features are ordered by their share of the overall mass,
    largest first, and features with no mass are left out.  Every subplot
    shows the overall average plus up to five distributions.  If
    args["GRAYSCALE"] (args = source[-1].read()) is set they are drawn as
    bars, otherwise as lines.  The figure is saved to target[0].
    """
    per_panel = 5
    args = source[-1].read()
    distros, labels, features = unpack_numpy(source[0].rstr(), dense=True)

    keep = [r for r, row in enumerate(distros) if row.sum() > 10]
    distros = distros[keep, :]
    labels = [labels[r] for r in keep]
    distros = numpy.transpose(distros.T / distros.sum(1))
    features = [f["_NAME"] for f in features]
    labels = [entry["_NAME"] for entry in labels]

    num_figs = 1 + int(len(labels) / per_panel)
    pyplot.figure(figsize=(7, 7 * num_figs))
    allvals = distros.sum(0) / distros.sum()
    ranked = sorted([(share, c) for c, share in enumerate(allvals) if share > 0], reverse=True)
    order = [pair[1] for pair in ranked]
    bar_width = 1.0 / num_figs

    def decorate_axes():
        # Legend, rotated feature names on the x axis, light grid.
        pyplot.legend(prop={"size" : 8})
        pyplot.xticks(range(len(order)), [features[c].strip() for c in order], rotation=45, fontsize=6, ha="right")
        pyplot.gca().grid(color='lightgrey', linestyle='-', linewidth=1)

    for panel in range(num_figs):
        start = per_panel * panel
        end = min(len(labels), per_panel * (panel + 1))
        pyplot.subplot(num_figs, 1, panel + 1)
        series = list(zip(labels[start : end], distros[start : end]))
        if args.get("GRAYSCALE"):
            positions = [p for p in range(len(order))]
            pyplot.bar(left=positions, height=[allvals[c] for c in order], label="Average", width=bar_width)
            for j, (name, vals) in enumerate(series):
                vals = vals / vals.sum()
                shifted = [p + j * (1.0 / num_figs) for p in range(len(order))]
                pyplot.bar(left=shifted, height=[vals[c] for c in order], label="%s" % name, width=bar_width)
        else:
            pyplot.plot([allvals[c] for c in order], label="Average", lw=2)
            for name, vals in series:
                vals = vals / vals.sum()
                pyplot.plot([vals[c] for c in order], label="%s" % (name))
            decorate_axes()
        decorate_axes()
    pyplot.savefig(target[0].rstr(), bbox_inches="tight")
    pyplot.cla()
    return None
def compute_lsa(target, source, env):
    """
    Cluster the rows of source[0] with k-means on its leading columns.

    args (source[-1].read()):
        DIMENSIONS -- number of leading columns to use (default: all).
        CLUSTERS   -- number of clusters K (default: number of rows // 5).
        METHOD     -- "kmeans" runs whiten() + kmeans() and pickles
                      ``(labels, codebook, distortion, whitened)`` to
                      target[0].  "hierarchical" is accepted but not
                      implemented.  Any other value writes nothing.

    The output handle is closed once the dump has finished.
    """
    args = source[-1].read()
    logging.info("starting...")
    mat, labels, features = unpack_numpy(source[0].rstr())
    dimensions = args.get("DIMENSIONS", mat.shape[1])
    matrix = mat[:, list(range(dimensions))]
    # Floor division keeps K an integer on every Python version.
    K = args.get("CLUSTERS", len(labels) // 5)
    method = args.get("METHOD", None)
    if method == "kmeans":
        logging.info("extracting %d clusters from %d singular values in term matrix of order %s", K, dimensions, matrix.shape)
        whitened = whiten(matrix)
        codebook, distortion = kmeans(whitened, K)
        fd = meta_open(target[0].rstr(), "w")
        try:
            pickle.dump((labels, codebook, distortion, whitened), fd)
        finally:
            fd.close()
    elif method == "hierarchical":
        pass
    return None
def combine(target, source, env):
    """
    Stack several matrix files vertically over the union of their features.

    Every source file is loaded (dense=False).  The output has one row per
    input row, in file order, and one column per distinct feature dict,
    sorted by the tuple of its items.  Values are copied to the column of
    their feature; everything else is zero.  The result is written to
    target[0] as a COO matrix.

    NOTE(review): a feature dict is identified by ``tuple(d.items())``, so
    two equal dicts that iterate in a different order would be treated as
    different features.
    """
    files = [unpack_numpy(x.rstr(), dense=False) for x in source]

    # Collect distinct feature keys with a set instead of concatenating
    # lists with sum(..., []), which is quadratic.
    feature_keys = set()
    for _mat, _labels, features in files:
        feature_keys.update(tuple(f.items()) for f in features)
    ordered_keys = sorted(feature_keys)
    all_features = [dict(key) for key in ordered_keys]
    # Key the column mapping on the original item tuples, so lookups below
    # cannot miss because of a dict round-trip.
    feature_mapping = dict((key, col) for col, key in enumerate(ordered_keys))

    all_labels = []
    R, C, V = [], [], []
    label_offset = 0
    for mat, labels, features in files:
        all_labels.extend(tuple(x.items()) for x in labels)
        features_tuples = [tuple(x.items()) for x in features]
        for row, col, val in zip(*sparse.find(mat)):
            R.append(row + label_offset)
            C.append(feature_mapping[features_tuples[col]])
            V.append(val)
        label_offset += mat.shape[0]
    newmat = sparse.coo_matrix((V, (R, C)), shape=(len(all_labels), len(all_features)))
    pack_numpy(target[0].rstr(), data=newmat, labels=[dict(x) for x in all_labels], features=all_features)
    return None
def sum_over_labels(target, source, env):
    """
    Sum the rows of source[0] that share the same label.

    args (source[-1].read()):
        SINGLE -- if set, all rows are summed into a single row, labelled
                  with this value.  A non-dict value is wrapped as
                  {"_NAME": value}.

    Otherwise the output has one row per distinct label dict, sorted by the
    tuple of its items.  Values landing on the same (label, column) cell
    are added up by the COO constructor.  Features are copied unchanged.

    Previously the SINGLE branch wrote the full list of distinct labels
    next to a one-row matrix; now the labels match the data.
    NOTE(review): the {"_NAME": ...} wrapping follows how labels are read
    elsewhere in this file -- confirm it matches what pack_numpy expects.
    """
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    single = args.get("SINGLE", None)
    if single:
        newmat = mat.sum(0)
        newlabels = [single if isinstance(single, dict) else {"_NAME": single}]
    else:
        keys = [tuple(x.items()) for x in labels]
        all_labels = sorted(set(keys))
        label_mapping = dict((key, row) for row, key in enumerate(all_labels))
        R, C, V = [], [], []
        for row, col, val in zip(*sparse.find(mat)):
            R.append(label_mapping[keys[row]])
            C.append(col)
            V.append(val)
        newmat = sparse.coo_matrix((V, (R, C)), shape=(len(all_labels), len(features)))
        newlabels = [dict(x) for x in all_labels]
    pack_numpy(target[0].rstr(), data=newmat, labels=newlabels, features=features)
    return None
def pairwise_log_likelihood(target, source, env):
    """
    Score every unordered pair of distinct labels, feature by feature.

    For each pair and each feature, the two rows' values and row totals are
    passed to ct_log_likelihood.  If both values are zero the score is 0.0
    and the call is skipped.

    NOTE: the score matrix is filled in and then dropped.  The pickle dump
    that used to write it to target[0] is disabled, so this builder
    currently produces no output file.
    """
    source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    pairs = list(set([frozenset([a, b]) for a in labels for b in labels if a != b]))
    scores = numpy.empty((len(pairs), len(features)))
    row_totals = mat.sum(1)
    for r, pair in enumerate(pairs):
        members = list(pair)
        first = labels.index(members[0])
        second = labels.index(members[1])
        for c in range(len(features)):
            if mat[first][c] == 0 and mat[second][c] == 0:
                score = 0.0
            else:
                score = ct_log_likelihood([
                    (mat[first][c], row_totals[first]),
                    (mat[second][c], row_totals[second]),
                ])
            scores[r][c] = score
    return None
def run_lda(target, source, env):
    """
    Run graphmod's LDA sampler on the count matrix in source[0].

    Each row becomes a document in which every word (feature "_NAME",
    UTF-8 encoded) is repeated as often as its count.  Empty documents are
    left out.

    args (source[-1].read()):
        topics     -- first argument to graphmod.LDA (default 100)
        alpha      -- second argument (default .01)
        beta       -- third argument (default .5)
        iterations -- number of calls to model.sample() (default 1000)

    target[0] receives the pickled tuple
    ``(mat, labels, words, assignments, topic-word counts)``.
    The output handle is closed once the dump has finished.
    """
    logging.info("loading docs...")
    args = source[-1].read()
    mat, labels, words = unpack_numpy(source[0].rstr(), dense=True)
    data = [[] for _ in range(mat.shape[0])]
    for i, w in enumerate(words):
        token = w["_NAME"].encode("utf-8")  # encode once per word, not per token
        for j, doc in enumerate(data):
            # int() so counts stored as floats can still be used as a
            # repeat count; non-positive counts add nothing.
            doc.extend([token] * int(mat[j, i]))
    docs = graphmod.DocumentCollection([x for x in data if len(x) > 0])
    model = graphmod.LDA(args.get("topics", 100), args.get("alpha", .01), args.get("beta", .5))
    model.load_docs(docs)
    iterations = args.get("iterations", 1000)
    for i in range(1, iterations + 1):
        logging.info("iteration %d/%d", i, iterations)
        model.sample()
    assigns = model.get_assignments()
    topics = model.get_topic_word_counts()
    fd = meta_open(target[0].rstr(), "w")
    try:
        pickle.dump((mat, labels, words, assigns, topics), fd)
    finally:
        fd.close()
    return None
def merge_features(target, source, env):
    """
    Merge columns of source[0] into clusters described by source[1].

    source[1] is loaded with numpy.load and handled in one of two ways.

    * If it contains a "cluster" entry, its "labels" (dicts with "_NAME")
      and "cluster" entries map feature names to cluster indices.  There is
      one output column per row of "centers" plus one extra, last column.
      A feature without a mapping gets index -1 and so falls into that last
      column.  Output features are "0", "1", ...

    * Otherwise it is treated as a sequence of clusters, each a sequence of
      feature names.  Output column i is the sum of the columns named in
      cluster i; unknown names are ignored.  Output features are
      "VCLUST_=_0", "VCLUST_=_1", ...

    source[-1] -- node whose .read() is called (its result is not used).
    target[0]  -- output: the merged matrix (COO) with the original labels.
    """
    source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    mat = mat.todense()
    merge = numpy.load(source[1].rstr())
    if "cluster" in merge:
        num_clusters = merge["centers"].shape[0] + 1
        newmat = numpy.empty(shape=(len(labels), num_clusters))
        feature_map = dict([(k.get("_NAME"), v) for k, v in zip(merge["labels"], merge["cluster"])])
        clusters = [[] for i in range(num_clusters)]
        for i, f in enumerate(features):
            clusters[feature_map.get(f.get("_NAME"), -1)].append(i)
        for i in range(len(clusters)):
            newmat[:, i] = mat[:, clusters[i]].sum(1)
        newfeatures = [str(i) for i in range(len(clusters))]
    else:
        # Every column is assigned below, so start from an empty sparse
        # matrix rather than from an uninitialised dense array.
        newmat = sparse.lil_matrix((len(labels), len(merge)))
        names = [f.get("_NAME") for f in features]
        # name -> first column with that name (what list.index returned),
        # built once instead of scanning the list for every cluster member.
        first_column = {}
        for col, name in enumerate(names):
            first_column.setdefault(name, col)
        newfeatures = ["VCLUST_=_%s" % str(i) for i in range(len(merge))]
        for i, cluster in enumerate(merge):
            columns = [first_column[f] for f in cluster if f in first_column]
            newmat[:, i] = mat[:, columns].sum(1)
    pack_numpy(target[0].rstr(), data=sparse.coo_matrix(newmat), labels=labels, features=newfeatures)
    return None
def npz_to_df(fname, rzero=True, transform=None, labelname="_NAME", featurename="_NAME", keep_labels=None, keep_features=None, transpose=False):
    """
    Load a matrix file and convert it to an R data frame.

    fname         -- path passed to unpack_numpy.
    rzero         -- accepted for compatibility; not used.
    transform     -- optional callable applied to the dense matrix.
    labelname     -- label-dict key used for row names and for keep_labels.
    featurename   -- feature-dict key used for column names and for
                     keep_features.
    keep_labels   -- if given, keep only the rows whose label value is in
                     this collection.
    keep_features -- if given, keep only the columns whose feature value is
                     in this collection.
    transpose     -- if true, the frame is built from the transposed
                     matrix (features as rows, labels as columns).

    Returns the result of r["as.data.frame"] with rownames and colnames
    set.  A label or feature without the naming key is named by its index.
    """
    mat, labels, features = unpack_numpy(fname)
    if sparse.issparse(mat):
        mat = numpy.asarray(mat.todense())
    if transform:
        mat = transform(mat)
    if keep_labels:
        indices = [i for i, x in enumerate(labels) if x.get(labelname) in keep_labels]
        mat = mat[indices, :]
        labels = [labels[i] for i in indices]
    if keep_features:
        indices = [i for i, x in enumerate(features) if x[featurename] in keep_features]
        # Features index the columns; this previously sliced rows instead.
        mat = mat[:, indices]
        features = [features[i] for i in indices]
    label_names = [x.get(labelname, i) for i, x in enumerate(labels)]
    feature_names = [x.get(featurename, i) for i, x in enumerate(features)]
    if transpose:
        mat = r["as.data.frame"](mat.T)
        mat.colnames = label_names
        mat.rownames = feature_names
    else:
        mat = r["as.data.frame"](mat)
        mat.rownames = label_names
        mat.colnames = feature_names
    return mat
def compare_distros(target, source, env):
    """
    Compute a Jensen-Shannon divergence per verb over its distributions.

    Feature names are expected to look like
    ``<UPPERCASE>-<verb>_<char>_<rest>``.  The verb is the first regex
    group, and the part after the last ``_=_`` names the value the column
    counts.  For every verb a (labels x values) count matrix is assembled,
    with zeros for values the verb never occurs with, and passed to
    jensen_shannon_divergence(..., counts=True).

    target[0] is written with numpy.savez and holds: results, distros,
    verbs, labels (the "LABEL_filename" of each row) and features (the
    value names, in the column order of the distro matrices).

    Raises AttributeError if a feature name does not match the pattern.

    Division and invalid-value warnings are silenced while computing.  The
    numpy error settings in force on entry are restored on exit, also when
    an exception is raised.
    """
    # seterr returns the previous settings so they can be put back later.
    old = numpy.seterr(divide='ignore', invalid='ignore')
    try:
        mat, labels, features = unpack_numpy(source[0].rstr())
        labels = [x.get("LABEL_filename") for x in labels]
        features = [x.get("_NAME") for x in features]
        over = set([x.split("_=_")[-1] for x in features])
        mat = mat.todense()
        verb_rx = re.compile("^[A-Z]+-(.*)_._.*$")
        allverbs = [verb_rx.match(x).group(1) for x in features]
        verbs = dict([(x, {"features" : [], "data" : {}}) for x in set(allverbs)])
        for i, v in enumerate(allverbs):
            ov = features[i].split("_=_")[-1]
            verbs[v]["data"][ov] = mat[:, i].T.tolist()[0]
        logging.info("comparing %s distributions", len(verbs))
        results = []
        distros = []
        verblist = []
        for verb, vals in verbs.items():
            # NOTE(review): vals always has the two keys "features" and
            # "data", so this test never skips anything.  Possibly
            # len(vals["data"]) > 1 was intended; behaviour is unchanged.
            if not len(vals) > 1:
                continue
            distros.append(numpy.asarray([vals["data"].get(x, [0 for i in range(len(labels))]) for x in over]).T)
            jsd = jensen_shannon_divergence(distros[-1], counts=True)
            results.append(jsd)
            verblist.append(verb)
        distros = numpy.asarray(distros)
        logging.info("built distro matrix of shape %s", distros.shape)
        numpy.savez(target[0].rstr(), results=results, distros=distros, verbs=verblist, labels=labels, features=list(over))
    finally:
        numpy.seterr(**old)
    return None
def update_numpy_file(target, source, env):
    """
    Re-save source[0] to target[0] with its data stored as a COO matrix.

    Labels and features are copied unchanged.
    """
    loaded, labels, features = unpack_numpy(source[0].rstr())
    pack_numpy(
        target[0].rstr(),
        data=sparse.coo_matrix(loaded),
        labels=labels,
        features=features,
    )
    return None
# Command-line inspection of matrix files.
# NOTE(review): `parser` and the option behind `options.rows` are not set up
# in this part of the file; presumably they are defined earlier -- confirm.
parser.add_option("-c", dest="cols", action="append", default=[])
parser.add_option("-o", dest="output", action="append", default=[])
parser.add_option("-i", dest="input", action="append", default=[])
parser.add_option("-t", dest="type", type="choice", choices=["sum_rows", "sum_cols", "print", "free"], default="print")
options, args = parser.parse_args()

logging.basicConfig(level = 0, format="%(asctime)s - %(message)s", filemode="w")

for fname in options.input:
    if options.type == "free":
        # Raw dump: list every array stored in the file with type and shape.
        fd = numpy.load(fname)
        for k in fd.keys():
            print("%s %s %s" % (k, type(fd[k]), fd[k].shape))
        continue

    mat, labels, features = unpack_numpy(fname, dense=True, oldstyle=True)
    labels = [entry["_NAME"] for entry in labels]
    features = [entry["_NAME"] for entry in features]

    # Default to the first ten rows/columns unless patterns were given.
    row_indices = range(min(10, len(labels)))
    col_indices = range(min(10, len(features)))
    if options.rows:
        row_indices = [n for n, name in enumerate(labels)
                       if any([re.match(rx, name) for rx in options.rows])]
    if options.cols:
        col_indices = [n for n, name in enumerate(features)
                       if any([re.match(rx, name) for rx in options.cols])]

    print("\t".join([labels[n] for n in row_indices]))
    print("\t".join([features[n] for n in col_indices]))
    if options.type == "sum_cols":
        print(mat[row_indices, :][:, col_indices].sum(1))
def plot_heatmap(target, source, env):
    """
    For a matrix of N instances by M numeric features, plot an NxM grid of
    cells where a cell's intensity is determined by the feature's value in
    that instance.

    The matrix is read from source[0] with unpack_numpy.  Rows and columns
    are ordered by sorted label / feature name, optionally restricted by
    env["LABEL_FILTER"] / env["FEATURE_FILTER"].  env["TRANSFORM"], if
    present, is applied to the plotted values (default: identity).

    args (source[-1].read()):
        LABEL_NAME / FEATURE_NAME -- dict keys used to name rows / columns
                                     (default "_NAME").
        sampled -- mapping subject -> file of sampled pairwise values.  The
                   diagonal then shows each subject's mean sampled value and
                   the upper triangle the average fraction of both subjects'
                   samples that lie below the cell value.
        reflect -- the diagonal is 0.0 and the upper triangle shows the
                   transformed values as well.
    With neither "sampled" nor "reflect", the upper triangle and diagonal
    are filled with 1.0 and an "N/A" text is drawn.

    The figure is saved to target[0].
    """
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    labels = [x.get(args.get("LABEL_NAME", "_NAME")) for x in labels]
    features = [x.get(args.get("FEATURE_NAME", "_NAME")) for x in features]
    # Per subject: all off-diagonal values of its sample matrix, flattened.
    samples = {}
    for sub, fname in args.get("sampled", {}).iteritems():
        smat, slabels, sfeatures = unpack_numpy(fname)
        samples[sub] = sum([[smat[x, y] for x in range(smat.shape[0]) if x != y] for y in range(smat.shape[1])], [])
    # Original indices, visited in sorted-name order, after filtering.
    roworder = [i for f, i in sorted([(f, i) for i, f in enumerate(labels)]) if "LABEL_FILTER" not in env or f in env["LABEL_FILTER"]]
    colorder = [i for f, i in sorted([(f, i) for i, f in enumerate(features)]) if "FEATURE_FILTER" not in env or f in env["FEATURE_FILTER"]]
    fig = pyplot.figure(figsize=(20, 20))
    C = []
    for row, datum in enumerate([mat[i] for i in roworder]):
        temp = []
        for col, val in enumerate([datum[i] for i in colorder]):
            if row == col and "sampled" in args:
                # Diagonal: mean of the subject's sampled values.
                # NOTE(review): picks the subject by position in
                # sorted(samples); assumes that order matches the row order.
                sub = sorted(samples)[row]
                temp.append(env.get("TRANSFORM", lambda x : x)(sum(samples[sub]) / float(len(samples[sub]))))
            elif row == col and "reflect" in args:
                temp.append(0.0)
            elif row - col > 0:
                # Lower triangle: the (transformed) value itself.
                temp.append(env.get("TRANSFORM", lambda x : x)(val))
            elif "sampled" in args:
                # Upper triangle: fraction of each subject's samples below
                # the value, averaged over the two subjects.
                lab1 = sorted(labels)[row]
                lab2 = sorted(labels)[col]
                sub1 = float(len([i for i in samples[lab1] if i < val])) / len(samples[lab1])
                sub2 = float(len([i for i in samples[lab2] if i < val])) / len(samples[lab2])
                temp.append((sub1 + sub2) / 2.0)
            elif "reflect" in args:
                temp.append(env.get("TRANSFORM", lambda x : x)(val))
            else:
                temp.append(1.0)
        C.append(temp)
    C = numpy.asarray(C)
    pyplot.gray()
    top = numpy.triu(C, 1)
    bottom = numpy.tril(C)
    rbottom = numpy.tril(C)
    if "sampled" in args:
        # Scale the lower triangle (incl. diagonal) so its maximum is 1.
        bottom = bottom / bottom.max()
    C_scaled = top + bottom
    pyplot.pcolor(C_scaled, vmin=0.0, vmax=1.0)
    # NOTE(review): the long title call below is kept disabled; rbottom is
    # referenced only by it.  Its exact extent was reconstructed -- confirm.
    # pyplot.title("Lighter indicates higher JSD/significance\nDiagonal shows homogeny from random samples\n\n\nJensen-Shannon Divergence (not comparable across features)\nValues range from %f (black) to %f (white)" % (min([rbottom[x[0], x[1]] for x in zip(*rbottom.nonzero())]), rbottom.max()))
    if "sampled" in args:
        pyplot.text(C.shape[1] + .1, C.shape[0] / 2, "Significance score (comparable across features), values range from 0%% (black) to 100%% (white)", va="center", rotation=270, fontsize=25)
    A = numpy.asarray(C)
    fs = 250 / len(colorder)
    # Print each cell's unscaled value, in a colour that contrasts with it.
    for y in range(A.shape[0]):
        for x in range(A.shape[1]):
            if x < y or "sampled" in args or "reflect" in args:
                if A[y, x] < .5:
                    c = "white"
                else:
                    c = "black"
                pyplot.text(x + .05, y + .25, "%.1e" % (A[y, x]), color=c, fontsize=fs)
    if "sampled" not in args and "reflect" not in args:
        pyplot.text(A.shape[0] / 1.5, A.shape[1] / 2.5, "N/A", fontsize=50)
    fs = 600 / len(colorder)
    # NOTE(review): x tick names are taken from labels, not features;
    # presumably the matrix is square with identical names -- confirm.
    pyplot.xticks([x + .7 for x in range(len(colorder))], sorted([labels[x].title() for x in colorder]), rotation=45, ha="right", fontsize=fs)
    pyplot.yticks([x + .5 for x in range(len(roworder))], sorted([labels[x].title() for x in roworder]), fontsize=fs)
    pyplot.title("Jensen-Shannon Divergence", fontsize=25)
    #pyplot.title(args.get("title", ""), fontsize=20)
    #pyplot.text(40, 19, "Significance score, ranges from 0%% (black) to 100%% (white)", rotation=270, ha="center", va="center")
    pyplot.axes().autoscale_view()
    pyplot.savefig(target[0].rstr(), dpi=100, bbox_inches="tight")
    pyplot.cla()
    pyplot.clf()
    return None