Example #1
def weight_log_likelihood(target, source, env):
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    wmat, wlabels, wfeatures = unpack_numpy(source[1].rstr())
    newmat = mat * wmat
    #pickle.dump((newmat, labels, features), meta_open(target[0].rstr(), "w"))
    return None
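All of these examples lean on unpack_numpy and pack_numpy, whose definitions are not shown. A minimal sketch of what they appear to do, assuming .npz storage of a (matrix, labels, features) triple; the real helpers evidently also handle sparse matrices and the dense/oldstyle flags seen below:

import numpy

def pack_numpy(fname, data=None, labels=None, features=None):
    # store the matrix alongside its row (labels) and column (features) metadata
    numpy.savez(fname, data=data, labels=labels, features=features)

def unpack_numpy(fname, dense=True, **kwargs):
    # return the (matrix, labels, features) triple written by pack_numpy;
    # sparse handling and the oldstyle flag are elided in this sketch
    fd = numpy.load(fname)
    return fd["data"], list(fd["labels"]), list(fd["features"])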
Example #2
def project_svd(target, source, env):
    args = source[-1].read()
    data, labels, features = unpack_numpy(source[0].rstr(), dense=False)
    proj, plabels, newfeatures = unpack_numpy(source[1].rstr(), dense=False)
    newdata = data * proj
    pack_numpy(target[0].rstr(), data=newdata, labels=labels, features=newfeatures)
    return None
Example #3
def feature_union(target, source, env):
    args = source[-1].read()
    dmat, dlabels, dfeatures = unpack_numpy(source[0].rstr())
    fmat, flabels, ffeatures = unpack_numpy(source[1].rstr())
    temp_dfeatures = [(re.match("^[A-Z]+-(.*)_(\=|\+)_.*$", x).group(1), i) for i, x in enumerate(dfeatures)]
    ffeatures = [x.split("_=_")[-1] for x in ffeatures]
    keep = [i for x, i in temp_dfeatures if x in ffeatures]
    newfeatures = sorted([(dfeatures[i], i) for i in keep])
    pack_numpy(target[0].rstr(), data=dmat[:, [i for n, i in newfeatures]], labels=dlabels, features=[n for n, i in newfeatures])
    return None
Example #4
def filter_same_features(target, source, env):
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    refmat, reflabels, reffeatures = unpack_numpy(source[1].rstr())
    mat = mat.tocsc()
    keep = [i for i, x in enumerate(features) if x in reffeatures]
    logging.info("keeping %d/%d features", len(keep), len(features))
    newmat = sparse.lil_matrix((len(labels), len(keep)))
    for i, col in enumerate([mat.getcol(i) for i in keep]):
        newmat[:, i] = col
    newfeatures = [features[i] for i in keep]
    pack_numpy(target[0].rstr(), data=newmat.tocoo(), labels=labels, features=newfeatures)
    return None
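As an aside, the column-copying loop above can be replaced by a single fancy-indexing operation on the CSC matrix, which is what Example #17 below does:

# equivalent to the lil_matrix copy loop: select the kept columns directly
newmat = mat[:, keep]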
Example #5
def normalize(target, source, env):
    old = numpy.seterr(divide='ignore', invalid='ignore')
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    feature_names = [x.get("_NAME") for x in features]
    feature_distros = [x.split("_=_")[0] for x in feature_names]
    distro_names = set(feature_distros)
    distros = dict([(i, []) for i in distro_names])
    for i, d in enumerate(feature_distros):
        distros[d].append(i)
    logging.info("found %d distribution(s)", len(distros))
    if len(distros) == 0:
        distros = {"ALL" : range(mat.shape[1])}
    mat = mat.tolil()
    newmat = sparse.lil_matrix(mat.shape)
    totals = numpy.empty(shape=(1, len(features)))
    for ri, row in [(i, mat.getrow(i).todense()) for i in range(mat.shape[0])]:
        logging.info("processing object %d/%d", ri, len(labels))
        for di, (name, indices) in enumerate(distros.iteritems()):
            totals[0, indices] = row[:, indices].sum()
        newmat[ri, :] = row / totals
    pack_numpy(target[0].rstr(), data=newmat.tocoo(), labels=labels, features=features)
    numpy.seterr(divide="warn", invalid="warn")
    return None
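For intuition, a toy run of the per-distribution division in normalize, with hypothetical feature names A_=_x, A_=_y, B_=_z: within each row, every distribution's block of columns is rescaled to sum to 1.

import numpy
row = numpy.array([[2.0, 6.0, 5.0]])     # counts for A_=_x, A_=_y, B_=_z
totals = numpy.array([[8.0, 8.0, 5.0]])  # per-distribution totals, spread per column
print row / totals                       # [[ 0.25  0.75  1.  ]]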
Example #6
def add_definitions(target, source, env):
    mat, words, features = unpack_numpy(source[0].rstr())
    fd = meta_open(target[0].rstr(), "w")
    fd.write("<?xml-stylesheet href=\"presentation.xsl\" type=\"text/xsl\" ?><xml>")
    defs = {}
    for elem in [e for e in et.parse(source[1].rstr()).getiterator() if e.tag.endswith("div") and e.attrib.get("type", "") == "entry"]:
        defs[elem.attrib["n"]] = elem #[x.text for x in elem.getiterator() if x.tag.endswith("item")]
    logging.info("loaded %d dictionary entries", len(defs))

    for val, word in zip(mat, words):
        if word in defs:
            lemma = [x for x in defs[word].getiterator() if x.tag.endswith("w")][0].attrib["lemma"]
            fd.write("<w><lemma>%s</lemma><vals>" % lemma.encode("utf-8"))                    
            for h, v in zip(features, val):
                fd.write("<val>%s</val>\n" % h)
            fd.write("</vals><vals>")
            for h, v in zip(features, val):
                fd.write("<val>%f</val>\n" % v)
        
            fd.write("</vals><defs>")
            for d in [x for x in defs[word].getiterator() if x.tag.endswith("item")]:
                fd.write("<def>%s</def>\n" % d.text.encode("utf-8"))
            fd.write("</defs></w>\n")
        

    fd.write("</xml>")
    return None
Example #7
def plot_scatter(target, source, env):
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    X = mat[:, 0]
    Y = mat[:, 1]
    L = labels
    shapes = ['+', 'o', 's', '^', 'v', '<', '>', 'd', 'p', 'h', '8']
    clustering = {}
    if env.get("CLUSTERS", None):
        for m in re.finditer("^(\d+) (\d+) \(\'?(.*?)\'?\)$", meta_open(env["CLUSTERS"].rstr()).read(), re.M):
            clustering[m.group(3)] = int(m.group(2))
    for x, y, label in zip(X, Y, L):
        try:
            point = pyplot.scatter([x], [y], s=10, c="black", marker=shapes[clustering.get(label, 0)])
        except Exception:
            print x
            sys.exit()
        a = pyplot.annotate(label.title(), (x, y), xytext=(15, 0), textcoords="offset points", fontsize=10, arrowprops=dict(arrowstyle="-"))
    pyplot.xticks([], [])
    pyplot.yticks([], [])
    pyplot.xlabel(features[0].rstrip().rstrip(","))
    pyplot.ylabel(features[1])
    pyplot.savefig(target[0].rstr(), dpi=100, bbox_inches="tight")
    pyplot.cla()
    return None
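The CLUSTERS file format is only implied by the regular expression above; a hypothetical line it accepts, and what gets extracted:

import re
line = "12 3 ('alice')"  # hypothetical cluster-file line: id, cluster, label
m = re.match("^(\d+) (\d+) \(\'?(.*?)\'?\)$", line)
print m.group(3), int(m.group(2))  # alice 3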
Example #8
def plot_spread(target, source, env):
    args = source[-1].read()
    raw_data = []
    labels = []
    for name, f in zip(args["SUBJECTS"], source):
        mat, l, features = unpack_numpy(f.rstr())
        raw_data.append(sum([[col for c, col in enumerate(row) if c != r] for r, row in enumerate(mat)], []))
        labels.append(name)
    raw_data = numpy.asarray(raw_data)
    #print data.min(), data.max()
    number = 50
    rmin, rmax = raw_data.min(), raw_data.max()
    inc = (rmax - rmin) / float(number)
    data = numpy.empty(shape=(len(labels), number))
    for r, label in enumerate(labels):
        data[r, :] = [len([i for i in raw_data[r] if rmin + c * inc < i < rmin + (c + 1) * inc]) for c in range(number)]
    #print data.shape
    #pyplot.figure(figsize=(7, 7 * 6))
    for i, ls in enumerate(["-", "--", "-.", ":"]):
        #pyplot.subplot(6, 1, i)
        for name, datum in [x for x in zip(args["SUBJECTS"], data)][i * len(labels) / 4 : (i + 1) * len(labels) / 4]:
            pyplot.plot(datum, label=name, ls=ls)
    inc = (rmax - rmin) / 4.0
    pyplot.xticks([x * (50 / 4) for x in range(5)], ["%.5f" % (rmin + inc * x) for x in range(5)])
    pyplot.xlabel("Jensen-Shannon Divergence")
    pyplot.ylabel("Number of intra-subdomain random sample pairs")
    pyplot.legend(prop={"size" : 6}, ncol=3)
    pyplot.savefig(target[0].rstr(), dpi=100, bbox_inches="tight")

    pyplot.cla()
    return None
Example #9
def split_numpy(target, source, env):
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    indices = {}
    for pat, fname in zip(args["PATTERNS"], target):
        indices = [i for i, x in enumerate(features) if x.get("_NAME", "_NAME").startswith("%s_" % pat) or x.get("_NAME", "_NAME").startswith("%s-" % pat)]
        pack_numpy(fname.rstr(), data=sparse.coo_matrix(mat.todense()[:, indices]), labels=labels, features=[features[i] for i in indices])
    return None
Example #10
def filter_features(target, source, env):
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    for targ, filt in zip(target, args["FILTERS"]):
        indices = [i for i, x in enumerate(features) if x.startswith(filt)]
        newfeatures = [x for x in features if x.startswith(filt)]
        pickle.dump((mat[:, indices], labels, newfeatures), meta_open(targ.rstr(), "w"))
    return None
Example #11
def remove_zero_features(target, source, env):
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr(), dense=True)
    mat = numpy.asarray(mat)
    indices = [i for i, x in enumerate(mat.sum(0)) if x > 0]
    mat = mat[:, indices]
    features = [features[i] for i in indices]
    pack_numpy(target[0].rstr(), data=mat, labels=labels, features=features)
    return None
Example #12
def compute_svd(target, source, env):
    args = source[-1].read()
    mat, docs, words = unpack_numpy(source[0].rstr(), dense=True)
    #mat = sparse.lil_matrix(mat)
    #mat = mat.asfptype()
    L, S, R = svd(mat, args.get("DIMENSIONS", 300))
    pack_numpy(target[0].rstr(), data=L, labels=docs, features=[{"SVD" : str(i)} for i in range(1, L.shape[1] + 1)])
    pack_numpy(target[1].rstr(), data=R.T, labels=words, features=[{"SVD" : str(i)} for i in range(1, R.shape[0] + 1)])
    return None
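The svd helper called above is not defined in these examples. A minimal sketch, assuming it returns a rank-k truncation; numpy.linalg.svd is used here, though the original may rely on a sparse solver such as scipy.sparse.linalg.svds:

import numpy

def svd(mat, k):
    # truncated SVD: keep only the k largest singular values
    U, s, Vh = numpy.linalg.svd(mat, full_matrices=False)
    return U[:, :k], s[:k], Vh[:k, :]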
Example #13
def numpy_to_arff(target, source, env):
    mat, labels, features = unpack_numpy(source[0].rstr())
    features = [x.get("_NAME") for x in features]
    fd = weka.Arff()
    for label, values in zip(labels, mat):
        datum = dict(zip(features, values))
        datum.update(label)
        fd.add_datum(datum)
    fd.save(meta_open(target[0].rstr(), "w"))
    return None
Example #14
def compute_log_likelihoods(target, source, env):
    args = source[-1].read()
    old = numpy.seterr(divide='ignore', invalid='ignore')
    mat, labels, features = unpack_numpy(source[0].rstr(), dense=False)
    mat = numpy.asarray(mat.todense())
    totals = mat.sum(1)
    logging.info("created totals matrix of shape %s", totals.shape)
    newmat = numpy.empty(shape=mat.shape)
    for i in range(mat.shape[1]):
        newmat[:, i] = ct_log_likelihood(mat[:, i], totals)
    pack_numpy(target[0].rstr(), data=newmat, labels=labels, features=features)
    numpy.seterr(divide="warn", invalid="warn")
    return None
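ct_log_likelihood is another helper that never appears. The call here passes a column and its totals, while Examples #18 and #23 pass a list of (count, total) pairs; a plausible sketch following the latter convention is a G-test log-likelihood ratio of observed counts against a pooled rate (an assumption, not the author's definition):

import math

def ct_log_likelihood(pairs):
    # pairs: [(count, total), ...]; compare each count against its
    # expectation under a single rate pooled across all pairs
    counts = [float(c) for c, t in pairs]
    totals = [float(t) for c, t in pairs]
    pooled = sum(counts) / sum(totals)
    ll = 0.0
    for c, t in zip(counts, totals):
        expected = t * pooled
        if c > 0 and expected > 0:
            ll += c * math.log(c / expected)
    return 2.0 * ll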
Example #15
def compute_pairwise(target, source, env):
    args = source[-1].read()
    if "EQUAL" in env:
        equal = dict([(x, cPickle.load(open(y[0].rstr()))[0]) for x, y in env["EQUAL"].iteritems()])
    old = numpy.seterr(divide='ignore', invalid='ignore')
    mat, labels, features = unpack_numpy(source[0].rstr(), dense=True)
    # feature_names = [x.get("_NAME") for x in features]
    # label_names = [x.get("_NAME") for x in labels]
    # if env.get("DISTROS", True):
    #     distros = {}
    #     for i, feat in enumerate(feature_names):
    #         dist = feat.split("_=_")[0]
    #         if dist not in distros:
    #             distros[dist] = []
    #         distros[dist].append(i)
    #     logging.info("comparing %d distributions", len(distros))
    # else:
    #     distros = {"ALL" : range(len(features))}
    # for k in distros.keys():
    #     distros[k] = mat[:, distros[k]]
    # pairwise_distro_vals = numpy.empty(shape=(len(labels), len(labels), len(distros)))
    # print pairwise_distro_vals.shape
    # distro_order = []
    # for i, (name, distro) in enumerate(distros.iteritems()):
    #     distro_order.append(name.split("-")[-1])
    #     if i % 1000 == 0:        
    #         logging.info("processed distro #%d/%d", i, len(distros))
    # if "WEIGHTS" in args:
    #     ldata, llabels, lfeatures = unpack_numpy(args["WEIGHTS"])
    #     lfeatures = [x.get("_NAME").split("_=_")[-1] for x in lfeatures]
    #     indices = [i for i, x in enumerate(lfeatures) if x in distro_order]
    #     zeros = [i for i, x in enumerate(distro_order) if x not in lfeatures and i <= len(indices) ]
    #     ldata = ldata[:, indices]
    #     ldata = numpy.insert(ldata, zeros + [len(indices) for i in range(len(distro_order) - len(zeros) - len(indices))], 0.0, axis=1)
    #     weights = numpy.empty(shape=pairwise_distro_vals.shape)
    #     for r in range(weights.shape[0]):
    #         for c in range(weights.shape[1]):
    #             weights[r, c, :] = ldata[[r, c], :].sum(0)
    #     res = numpy.average(pairwise_distro_vals, axis=2, weights=weights)
    # else:
    #     res = numpy.average(pairwise_distro_vals, axis=2)
    # newmat = pairwise_distro_vals.sum(2) / pairwise_distro_vals.shape[2]
    # newmat = newmat[[i for i, x in enumerate(labels) if x["_NAME"] not in args.get("FILTER", [])], :][:, [i for i, x in enumerate(labels) if x["_NAME"] not in args.get("FILTER", [])]]
    labels = [x for x in labels if x["_NAME"] not in args.get("FILTER", [])]
    data = env["FUNCTION"](mat)
    pack_numpy(target[0].rstr(), data=data, labels=labels, features=labels)
    numpy.seterr(divide="warn", invalid="warn")
    return None
Example #16
def plot_dendrogram(target, source, env):
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    roworder = [i for i, f in enumerate(sorted(labels)) if "LABEL_FILTER" not in env or f in env["LABEL_FILTER"]]
    colorder = [i for i, f in enumerate(sorted(features)) if "FEATURE_FILTER" not in env or f in env["FEATURE_FILTER"]]
    mat = mat[roworder][:, colorder][0:50]
    if args.get("DIRECT", False):
        Y = squareform(mat, checks=False)
    else:
        Y = pdist(mat, "cosine")
    Z = abs(linkage(Y, args.get("LINKAGE", 'average')))
    dendrogram(Z, labels=labels, orientation="right", color_threshold=-1, leaf_font_size="xx-small")
    pyplot.savefig(target[0].rstr(), dpi=100, bbox_inches="tight")
    pyplot.cla()
    pyplot.clf()
    return None
Example #17
def filter_counts(target, source, env):
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    mat = mat.tocsc()
    totals = mat.sum(0)
    if "THRESHOLD" in args:
        keep = [i for i, x in enumerate(totals.T) if x >= args["THRESHOLD"]]
    elif "NUMBER" in args:
        keep = [j for y, j in sorted([(x, i) for i, x in enumerate(totals.T)], reverse=True)[0:args["NUMBER"]]]
        pass
    logging.info("keeping %d/%d features", len(keep), len(totals.T))
    mat = mat.tocsc()
    newmat = mat[:, keep]
    newfeatures = [features[i] for i in keep]
    pack_numpy(target[0].rstr(), data=newmat.tocoo(), labels=labels, features=newfeatures)
    return None
Example #18
def log_likelihood(target, source, env):
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    newmat = numpy.empty(mat.shape)
    obs_totals = mat.sum(1)
    feat_totals = mat.sum(0)
    total = feat_totals.sum()
    for col, feature in enumerate(features):
        for row, label in enumerate(labels):
            ll = ct_log_likelihood([
                (mat[row][col], obs_totals[row]),
                (feat_totals[col] - mat[row][col], total - obs_totals[row])
                ])
            newmat[row][col] = ll
    #pickle.dump((newmat, labels, features), meta_open(target[0].rstr(), "w"))
    return None
Example #19
def plot_distros(target, source, env):
    """
    Plots a matrix of distributions (rows) over features (columns)

    N distributions per figure
    """
    N = 5
    args = source[-1].read()
    distros, labels, features = unpack_numpy(source[0].rstr(), dense=True)
    keep = [i for i, x in enumerate(distros) if x.sum() > 10]
    distros = distros[keep, :]
    labels = [labels[i] for i in keep]

    distros = numpy.transpose(distros.T / distros.sum(1))
    #print distros[0]
    features = [x["_NAME"] for x in features]
    labels = [x["_NAME"] for x in labels]
    num_figs = 1 + int(len(labels) / N)
    pyplot.figure(figsize=(7, 7 * num_figs))
    allvals = distros.sum(0) / distros.sum()
    order = [x[1] for x in sorted([(y, i) for i, y in enumerate(allvals) if y > 0], reverse=True)]
    for i in range(num_figs):
        start = N * i
        end = min(len(labels), N * (i + 1))
        pyplot.subplot(num_figs, 1, i + 1)
        if args.get("GRAYSCALE"):
            pyplot.bar(left=[i for i in range(len(order))], height=[allvals[i] for i in order], label="Average", width=1.0 / num_figs)
            for j, (name, vals) in enumerate([x for x in zip(labels[start : end], distros[start : end])]): # if x[1].sum() > 0]:
                total = vals.sum()
                vals = vals / vals.sum()
                pyplot.bar(left=[i + j * (1.0 / num_figs)  for i in range(len(order))], height=[vals[i] for i in order], label="%s" % name, width=1.0 / num_figs)
        else:
            pyplot.plot([allvals[i] for i in order], label="Average", lw=2)
            for name, vals in [x for x in zip(labels[start : end], distros[start : end])]: # if x[1].sum() > 0]:
                total = vals.sum()
                vals = vals / vals.sum()
                pyplot.plot([vals[i] for i in order], label="%s" % (name))
        pyplot.legend(prop={"size" : 8})
        pyplot.xticks(range(len(order)), [features[i].strip() for i in order], rotation=45, fontsize=6, ha="right")
        pyplot.gca().grid(color='lightgrey', linestyle='-', linewidth=1)
    pyplot.savefig(target[0].rstr(), bbox_inches="tight")
    
    pyplot.cla()
    return None
Example #20
def compute_lsa(target, source, env):
    args = source[-1].read()
    logging.info("starting...")
    mat, labels, features = unpack_numpy(source[0].rstr())
    dimensions = args.get("DIMENSIONS", mat.shape[1])
    matrix = mat[:, range(dimensions)]
    items = labels
    K = args.get("CLUSTERS", len(items) / 5)
    if args.get("METHOD", None) == "kmeans":
        logging.info("extracting %d clusters from %d singular values in term matrix of order %s", K, dimensions, matrix.shape)
        whitened = whiten(matrix)
        book = numpy.array((whitened[0], whitened[2]))
        codebook, distortion = kmeans(whitened, K)
        pickle.dump((items, codebook, distortion, whitened), meta_open(target[0].rstr(), "w"))
    elif args.get("METHOD", None) == "hierarchical":
        pass
    return None
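The k-means branch stores the codebook but never assigns points to clusters; if assignments are needed downstream, scipy.cluster.vq.vq would produce them from the example's whitened and codebook (a sketch of a follow-up step, not something the original does here):

from scipy.cluster.vq import vq
assignments, distances = vq(whitened, codebook)  # nearest codebook entry per row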
Example #21
def combine(target, source, env):
    files = [unpack_numpy(x.rstr(), dense=False) for x in source]
    all_labels = []
    all_features = [dict(y) for y in sorted(set(sum([[tuple(i.iteritems()) for i in x[2]] for x in files], [])))]
    feature_mapping = dict([(tuple(b.iteritems()), a) for a, b in enumerate(all_features)])
    R, C, V = [], [], []
    label_offset = 0
    for mat, labels, features in files:
        all_labels += [tuple(x.iteritems()) for x in labels]
        features_tuples = [tuple(x.iteritems()) for x in features]
        for row, col, val in zip(*sparse.find(mat)):
            R.append(row + label_offset)
            C.append(feature_mapping[features_tuples[col]])
            V.append(val)
        label_offset += mat.shape[0]
    newmat = sparse.coo_matrix((V, (R, C)), shape=(len(all_labels), len(all_features)))
    pack_numpy(target[0].rstr(), data=newmat, labels=[dict(x) for x in all_labels], features=all_features)
    return None
Example #22
def sum_over_labels(target, source, env):
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    labels = [tuple(x.iteritems()) for x in labels]
    all_labels = sorted(set(labels))
    label_mapping = dict([(b, a) for a, b in enumerate(all_labels)])
    total = len(sparse.find(mat)[0])
    if args.get("SINGLE", None):
        newlabels = [args.get("SINGLE", "?")]
        newmat = mat.sum(0)
    else:
        R, C, V = [], [], []
        for row, col, val in zip(*sparse.find(mat)):
            R.append(label_mapping[labels[row]])
            C.append(col)
            V.append(val)
    newmat = sparse.coo_matrix((V, (R, C)), shape=(len(all_labels), len(features)))
    pack_numpy(target[0].rstr(), data=newmat, labels=[dict(x) for x in all_labels], features=features)
    return None
Example #23
def pairwise_log_likelihood(target, source, env):
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    combos = list(set([frozenset([x, y]) for x in labels for y in labels if x != y]))
    newmat = numpy.empty((len(combos), len(features)))
    obs_totals = mat.sum(1)
    for row, combo in enumerate(combos):
        rowA, rowB = [labels.index(x) for x in combo]
        for col, feature in enumerate(features):
            if mat[rowA][col] == 0 and mat[rowB][col] == 0:
                ll = 0.0
            else:
                ll = ct_log_likelihood([
                    (mat[rowA][col], obs_totals[rowA]),
                    (mat[rowB][col], obs_totals[rowB])
                    ])
            newmat[row][col] = ll
    #pickle.dump((newmat, combos, [x for x in features]), meta_open(target[0].rstr(), "w"))
    return None
Example #24
def run_lda(target, source, env):
    logging.info("loading docs...")
    args = source[-1].read()
    mat, labels, words = unpack_numpy(source[0].rstr(), dense=True)
    data = [[] for x in range(mat.shape[0])]
    for i, w in enumerate(words):
        lemma = w["_NAME"]
        for j in range(len(data)):
            data[j] += [lemma.encode("utf-8") for x in range(int(mat[j, i]))]

    docs = graphmod.DocumentCollection([x for x in data if len(x) > 0])
    model = graphmod.LDA(args.get("topics", 100), args.get("alpha", .01), args.get("beta", .5))
    model.load_docs(docs)
    for i in range(1, args.get("iterations", 1000) + 1):
        logging.info("iteration %d/%d" % (i, args.get("iterations", 1000)))
        model.sample()
    
    assigns = model.get_assignments()
    topics = model.get_topic_word_counts()
    #print topics.shape, mat.shape
    pickle.dump((mat, labels, words, assigns, topics), meta_open(target[0].rstr(), "w"))
    return None
Example #25
def merge_features(target, source, env):
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    mat = mat.todense()
    merge = numpy.load(source[1].rstr())
    if "cluster" in merge:
        newmat = numpy.empty(shape=(len(labels), merge["centers"].shape[0] + 1))
        feature_map = dict([(k.get("_NAME"), v) for k, v in zip(merge["labels"], merge["cluster"])])
        clusters = [[] for i in range(merge["centers"].shape[0] + 1)]
        for i, f in enumerate(features):
            clusters[feature_map.get(f.get("_NAME"), -1)].append(i)
        for i in range(len(clusters)):
            newmat[:, i] = mat[:, clusters[i]].sum(1)
        newfeatures = [str(i) for i in range(len(clusters))]
    else:
        newmat = sparse.lil_matrix(numpy.empty(shape=(len(labels), len(merge))))
        features = [f.get("_NAME") for f in features]
        newfeatures = ["VCLUST_=_%s" % str(i) for i in range(len(merge))]
        for i, cluster in enumerate(merge):
            newmat[:, i] = mat[:, [features.index(f) for f in cluster if f in features]].sum(1)
    pack_numpy(target[0].rstr(), data=sparse.coo_matrix(newmat), labels=labels, features=newfeatures)
    return None
Example #26
def npz_to_df(fname, rzero=True, transform=None, labelname="_NAME", featurename="_NAME", keep_labels=None, keep_features=None, transpose=False):
    mat, labels, features = unpack_numpy(fname)
    if sparse.issparse(mat):
        mat = numpy.asarray(mat.todense())
    if transform:
        mat = transform(mat)
    if keep_labels:
        indices = [i for i, x in enumerate(labels) if x.get(labelname) in keep_labels]
        mat = mat[indices, :]
        labels = [labels[i] for i in indices]
    if keep_features:
        indices = [i for i, x in enumerate(features) if x[featurename] in keep_features]
        mat = mat[:, indices]
        features = [features[i] for i in indices]
    if transpose:
        mat = r["as.data.frame"](mat.T)
        mat.colnames = [x.get(labelname, i) for i, x in enumerate(labels)]
        mat.rownames = [x.get(featurename, i) for i, x in enumerate(features)]
    else:
        mat = r["as.data.frame"](mat)
        mat.rownames = [x.get(labelname, i) for i, x in enumerate(labels)]
        mat.colnames = [x.get(featurename, i) for i, x in enumerate(features)]
    return mat
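npz_to_df hands the matrix to R as a data frame via rpy2's r object, which the example assumes is already imported. A minimal usage sketch with a hypothetical file name and labels:

from rpy2.robjects import r
df = npz_to_df("verbs.npz", keep_labels=["alice", "bob"], transpose=True)
print r["dim"](df)  # features x labels after the transpose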
Example #27
def compare_distros(target, source, env):
    old = numpy.seterr(divide='ignore', invalid='ignore')
    mat, labels, features = unpack_numpy(source[0].rstr())
    labels = [x.get("LABEL_filename") for x in labels]
    features = [x.get("_NAME") for x in features]
    over = set([x.split("_=_")[-1] for x in features])
    mat = mat.todense()
    allverbs = [re.match("^[A-Z]+-(.*)_._.*$", x).group(1) for x in features]
    verbs = dict([(x, {"features" : [], "data" : {}}) for x in set(allverbs)])
    for i, v in enumerate(allverbs):
        ov = features[i].split("_=_")[-1]
        verbs[v]["data"][ov] = mat[:, i].T.tolist()[0]
    logging.info("comparing %s distributions", len(verbs))
    results = []
    distros = []
    verblist = []
    for verb, vals in [x for x in verbs.iteritems() if len(x[1]["data"]) > 1]: 
        distros.append(numpy.asarray([vals["data"].get(x, [0 for i in range(len(labels))]) for x in over]).T)
        #try:
        #    numpy.asarray([vals["data"].get(x, [0 for i in range(len(labels))]) for x in over])
        #except:
        #    print [vals["data"].get(x, [0 for i in range(len(labels))]) for x in over]
        #    sys.exit()
        jsd = jensen_shannon_divergence(distros[-1], counts=True)
        results.append(jsd)
        verblist.append(verb)
    distros = numpy.asarray(distros)
    logging.info("built distro matrix of shape %s", distros.shape)
    numpy.seterr(divide="warn", invalid="warn")
    numpy.savez(target[0].rstr(),
                results=results,
                distros=distros, #[sparse.lil_matrix(distros[:, :, i]) for i in range(distros.shape[-1])],
                verbs=verblist,
                labels=labels,
                features=list(over))
    return None
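jensen_shannon_divergence is likewise undefined here. A plausible sketch, assuming counts=True means the rows are raw counts to be normalized into distributions, and that the helper returns the generalized JSD of the rows (again an assumption about the author's helper):

import numpy

def jensen_shannon_divergence(rows, counts=False):
    rows = numpy.asarray(rows, dtype=float)
    if counts:
        rows = rows / rows.sum(1)[:, None]  # normalize each row to sum to 1
    def entropy(p):
        nz = p[p > 0]
        return -(nz * numpy.log2(nz)).sum()
    # entropy of the mean distribution minus the mean of the row entropies
    return entropy(rows.mean(0)) - numpy.mean([entropy(p) for p in rows])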
Example #28
def update_numpy_file(target, source, env):
    data, labels, features = unpack_numpy(source[0].rstr())
    data = sparse.coo_matrix(data)
    pack_numpy(target[0].rstr(), data=data, labels=labels, features=features)
    return None
Example #29
parser.add_option("-c", dest="cols", action="append", default=[])
parser.add_option("-o", dest="output", action="append", default=[])
parser.add_option("-i", dest="input", action="append", default=[])
parser.add_option("-t", dest="type", type="choice", choices=["sum_rows", "sum_cols", "print", "free"], default="print")
options, args = parser.parse_args()

logging.basicConfig(level = 0, format="%(asctime)s - %(message)s", filemode="w")

for fname in options.input:
    if options.type == "free":
        fd = numpy.load(fname)
        for k in fd.keys():
            print k, type(fd[k]), fd[k].shape
        continue

    mat, labels, features = unpack_numpy(fname, dense=True, oldstyle=True)
    labels = [x["_NAME"] for x in labels]
    features = [x["_NAME"] for x in features]
    
    row_indices = range(min(10, len(labels)))
    col_indices = range(min(10, len(features)))
    if options.rows:
        row_indices = [i for i, x in enumerate(labels) if any([re.match(rx, x) for rx in options.rows])]
    if options.cols:
        col_indices = [i for i, x in enumerate(features) if any([re.match(rx, x) for rx in options.cols])]

    print "\t".join([labels[i] for i in row_indices])
    print "\t".join([features[i] for i in col_indices])

    if options.type == "sum_cols":
        print mat[row_indices, :][:, col_indices].sum(1)
Example #30
def plot_heatmap(target, source, env):
    """
    For a matrix of N instances by M numeric features, plot an NxM grid of cells
    where a cell's intensity is determined by the feature's value in that instance.
    """
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    labels = [x.get(args.get("LABEL_NAME", "_NAME")) for x in labels]
    features = [x.get(args.get("FEATURE_NAME", "_NAME")) for x in features]
    samples = {}
    for sub, fname in args.get("sampled", {}).iteritems():
        smat, slabels, sfeatures = unpack_numpy(fname)
        samples[sub] = sum([[smat[x, y] for x in range(smat.shape[0]) if x != y] for y in range(smat.shape[1])], [])
    roworder = [i for f, i in sorted([(f, i) for i, f in enumerate(labels)]) if "LABEL_FILTER" not in env or f in env["LABEL_FILTER"]]
    colorder = [i for f, i in sorted([(f, i) for i, f in enumerate(features)]) if "FEATURE_FILTER" not in env or f in env["FEATURE_FILTER"]]
    fig = pyplot.figure(figsize=(20, 20))
    C = []
    for row, datum in enumerate([mat[i] for i in roworder]):
        temp = []
        for col, val in enumerate([datum[i] for i in colorder]):
            if row == col and "sampled" in args:
                sub = sorted(samples)[row]
                temp.append(env.get("TRANSFORM", lambda x : x)(sum(samples[sub]) / float(len(samples[sub]))))
            elif row == col and "reflect" in args:
                temp.append(0.0)
            elif row - col > 0:
                temp.append(env.get("TRANSFORM", lambda x : x)(val))
            elif "sampled" in args:
                lab1 = sorted(labels)[row]
                lab2 = sorted(labels)[col]
                sub1 = float(len([i for i in samples[lab1] if i < val])) / len(samples[lab1])
                sub2 = float(len([i for i in samples[lab2] if i < val])) / len(samples[lab2])
                temp.append((sub1 + sub2) / 2.0)
            elif "reflect" in args:
                temp.append(env.get("TRANSFORM", lambda x : x)(val))
            else:
                temp.append(1.0)
        C.append(temp)
    C = numpy.asarray(C)
    pyplot.gray()
    top = numpy.triu(C, 1)
    bottom = numpy.tril(C)
    rbottom = numpy.tril(C)
    if "sampled" in args:
        bottom = bottom / bottom.max()
    C_scaled = top + bottom
    pyplot.pcolor(C_scaled, vmin=0.0, vmax=1.0)

#    pyplot.title("Lighter indicates higher JSD/significance\nDiagonal shows homogeny from random samples\n\n\nJensen-Shannon Divergence (not comparable across features)\nValues range from %f (black) to %f (white)" % (min([rbottom[x[0], x[1]] for x in zip(*rbottom.nonzero())]), rbottom.max()))
    if "sampled" in args:
        pyplot.text(C.shape[1] + .1, C.shape[0] / 2, "Significance score (comparable across features), values range from 0% (black) to 100% (white)", va="center", rotation=270, fontsize=25)
    A = numpy.asarray(C)
    fs = 250 / len(colorder)
    for y in range(A.shape[0]):
        for x in range(A.shape[1]):
            if x < y or "sampled" in args or "reflect" in args:
                if A[y, x] < .5:
                    c = "white"
                else:
                    c = "black"
                pyplot.text(x + .05, y + .25, "%.1e" % (A[y, x]), color=c, fontsize=fs)
    if "sampled" not in args and "reflect" not in args:
        pyplot.text(A.shape[0] / 1.5, A.shape[1] / 2.5, "N/A", fontsize=50)
    fs = 600 / len(colorder)
    pyplot.xticks([x + .7 for x in range(len(colorder))], sorted([labels[x].title() for x in colorder]), rotation=45, ha="right", fontsize=fs)
    pyplot.yticks([x + .5 for x in range(len(roworder))], sorted([labels[x].title() for x in roworder]), fontsize=fs)
    pyplot.title("Jensen-Shannon Divergence", fontsize=25)
    #pyplot.title(args.get("title", ""), fontsize=20)
    #pyplot.text(40, 19, "Significance score, ranges from 0%% (black) to 100%% (white)", rotation=270, ha="center", va="center")
    pyplot.axes().autoscale_view()
    pyplot.savefig(target[0].rstr(), dpi=100, bbox_inches="tight")
    pyplot.cla()
    pyplot.clf()
    return None