Example #1
def numpy_file(target, source, env):
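    # SCons-style builder action (inferred from the (target, source, env) signature
    # and the Value node read via source[-1].read()): reads (text, label) pairs from
    # the input nodes, extracts a feature dict per text, and packs the result as a
    # labels-by-features matrix, sparse or dense depending on args["SPARSE"].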
    args = source[-1].read()
    rows = []
    for text, label in sum([env["ARGS"]["SPLIT"](fd.rstr()) for fd in source[0:-1]], []):
        feats = {}
        for extractor in [env["ARGS"]["EXTRACT"]]:
            feats.update(extractor(text))
        rows.append((label.items(), feats))

    features = sorted(set(sum([x[1].keys() for x in rows], [])))
    labels = []
    if args.get("SPARSE", False):
        logging.info("creating sparse file")
        R, C, V = [], [], []
        for row, (name, vals) in enumerate(rows):
            labels.append(name) #"_".join([name[x] for x in source[-1].read()["LABEL"]]))
            new_vals = [(row, col, vals[feature]) for col, feature in enumerate(features) if feature in vals]
            R += [x[0] for x in new_vals]
            C += [x[1] for x in new_vals]
            V += [x[2] for x in new_vals]
        mat = sparse.coo_matrix((V, (R, C)), shape=(len(labels), len(features)))
    else:
        mat = numpy.zeros((len(rows), len(features)))
        for row, (name, vals) in enumerate(rows):
            labels.append(name) #"_".join([name[x] for x in source[-1].read()["LABEL"]]))
            new_vals = [(row, col, vals[feature]) for col, feature in enumerate(features) if feature in vals]
            for r, c, v in new_vals:
                mat[r, c] = v
    logging.info("created %dx%d data file", len(labels), len(features))
    pack_numpy(target[0].rstr(), data=mat, labels=labels, features=features)
    return None
Example #2
def project_svd(target, source, env):
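    # Projects the data matrix from source[0] onto the basis from source[1]
    # (a plain matrix multiply), keeping the original labels but adopting the
    # projection's feature names.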
    args = source[-1].read()
    data, labels, features = unpack_numpy(source[0].rstr(), dense=False)
    proj, plabels, newfeatures = unpack_numpy(source[1].rstr(), dense=False)
    newdata = data * proj
    pack_numpy(target[0].rstr(), data=newdata, labels=labels, features=newfeatures)
    return None
Example #3
def normalize(target, source, env):
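    # Row-normalizes the matrix so that, within each row, every named
    # distribution (the features sharing a prefix before "_=_") sums to one.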
    old = numpy.seterr(divide='ignore', invalid='ignore')
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    feature_names = [x.get("_NAME") for x in features]
    feature_distros = [x.split("_=_")[0] for x in feature_names]
    distro_names = set(feature_distros)
    distros = dict([(i, []) for i in distro_names])
    for i, d in enumerate(feature_distros):
        distros[d].append(i)
    logging.info("found %d distribution(s)", len(distros))
    if len(distros) == 0:
        distros = {"ALL" : range(mat.shape[1])}
    mat = mat.tolil()
    newmat = sparse.lil_matrix(mat.shape)
    totals = numpy.empty(shape=(1, len(features)))
    for ri, row in [(i, mat.getrow(i).todense()) for i in range(mat.shape[0])]:
        logging.info("processing object %d/%d", ri, len(labels))
        for di, (name, indices) in enumerate(distros.iteritems()):
            totals[0, indices] = row[:, indices].sum()
        newmat[ri, :] = row / totals
    pack_numpy(target[0].rstr(), data=newmat.tocoo(), labels=labels, features=features)
    numpy.seterr(divide="warn", invalid="warn")
    return None
Example #4
def split_numpy(target, source, env):
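    # Splits one matrix into several output files, one per pattern in
    # args["PATTERNS"], keeping only the feature columns whose _NAME starts
    # with "<pattern>_" or "<pattern>-".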
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    for pat, fname in zip(args["PATTERNS"], target):
        indices = [i for i, x in enumerate(features) if x.get("_NAME", "_NAME").startswith("%s_" % pat) or x.get("_NAME", "_NAME").startswith("%s-" % pat)]
        pack_numpy(fname.rstr(), data=sparse.coo_matrix(mat.todense()[:, indices]), labels=labels, features=[features[i] for i in indices])
    return None
Example #5
def compute_svd(target, source, env):
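    # Computes a rank-k SVD (k = args["DIMENSIONS"], default 300) and writes
    # two files: the left singular vectors, labeled by document, and the
    # transposed right singular vectors, labeled by word.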
    args = source[-1].read()
    mat, docs, words = unpack_numpy(source[0].rstr(), dense=True)
    #mat = sparse.lil_matrix(mat)
    #mat = mat.asfptype()
    L, S, R = svd(mat, args.get("DIMENSIONS", 300))
    pack_numpy(target[0].rstr(), data=L, labels=docs, features=[{"SVD" : str(i)} for i in range(1, L.shape[1] + 1)])
    pack_numpy(target[1].rstr(), data=R.T, labels=words, features=[{"SVD" : str(i)} for i in range(1, R.shape[0] + 1)])
    return None
Example #6
def remove_zero_features(target, source, env):
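    # Drops every feature column whose sum over all rows is zero.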
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr(), dense=True)
    mat = numpy.asarray(mat)
    indices = [i for i, x in enumerate(mat.sum(0)) if x > 0]
    mat = mat[:, indices]
    features = [features[i] for i in indices]
    pack_numpy(target[0].rstr(), data=mat, labels=labels, features=features)
    return None
Example #7
def feature_union(target, source, env):
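    # Keeps only the columns of source[0] whose parsed feature name (the part
    # between the uppercase prefix and the "_=_"/"_+_" marker) also occurs
    # among the feature names of source[1].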
    args = source[-1].read()
    dmat, dlabels, dfeatures = unpack_numpy(source[0].rstr())
    fmat, flabels, ffeatures = unpack_numpy(source[1].rstr())
    temp_dfeatures = [(re.match(r"^[A-Z]+-(.*)_(=|\+)_.*$", x).group(1), i) for i, x in enumerate(dfeatures)]
    ffeatures = [x.split("_=_")[-1] for x in ffeatures]
    keep = [i for x, i in temp_dfeatures if x in ffeatures]
    newfeatures = sorted([(dfeatures[i], i) for i in keep])
    pack_numpy(target[0].rstr(), data=dmat[:, [i for n, i in newfeatures]], labels=dlabels, features=[n for n, i in newfeatures])
    return None
Example #8
def compute_log_likelihoods(target, source, env):
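    # Replaces each count column with its log-likelihood statistic, computed
    # by ct_log_likelihood against the per-row totals.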
    args = source[-1].read()
    old = numpy.seterr(divide='ignore', invalid='ignore')
    mat, labels, features = unpack_numpy(source[0].rstr(), dense=False)
    mat = numpy.asarray(mat.todense())
    totals = mat.sum(1)
    logging.info("created totals matrix of shape %s", totals.shape)
    newmat = numpy.empty(shape=mat.shape)
    for i in range(mat.shape[1]):
        newmat[:, i] = ct_log_likelihood(mat[:, i], totals)
    pack_numpy(target[0].rstr(), data=newmat, labels=labels, features=features)
    numpy.seterr(divide="warn", invalid="warn")
    return None
Example #9
def filter_same_features(target, source, env):
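    # Restricts source[0] to the feature columns that also appear in the
    # reference file source[1].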
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    refmat, reflabels, reffeatures = unpack_numpy(source[1].rstr())
    mat = mat.tocsc()
    keep = [i for i, x in enumerate(features) if x in reffeatures]
    logging.info("keeping %d/%d features", len(keep), len(features))
    newmat = sparse.lil_matrix((len(labels), len(keep)))
    for i, col in enumerate([mat.getcol(i) for i in keep]):
        newmat[:, i] = col
    newfeatures = [features[i] for i in keep]
    pack_numpy(target[0].rstr(), data=newmat.tocoo(), labels=labels, features=newfeatures)
    return None
Example #10
def compute_pairwise(target, source, env):
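    # Applies env["FUNCTION"] to the data matrix to produce a pairwise
    # label-by-label matrix, after dropping the labels named in args["FILTER"].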
    args = source[-1].read()
    if "EQUAL" in env:
        equal = dict([(x, cPickle.load(open(y[0].rstr()))[0]) for x, y in env["EQUAL"].iteritems()])
    old = numpy.seterr(divide='ignore', invalid='ignore')
    mat, labels, features = unpack_numpy(source[0].rstr(), dense=True)
    # feature_names = [x.get("_NAME") for x in features]
    # label_names = [x.get("_NAME") for x in labels]
    # if env.get("DISTROS", True):
    #     distros = {}
    #     for i, feat in enumerate(feature_names):
    #         dist = feat.split("_=_")[0]
    #         if dist not in distros:
    #             distros[dist] = []
    #         distros[dist].append(i)
    #     logging.info("comparing %d distributions", len(distros))
    # else:
    #     distros = {"ALL" : range(len(features))}
    # for k in distros.keys():
    #     distros[k] = mat[:, distros[k]]
    # pairwise_distro_vals = numpy.empty(shape=(len(labels), len(labels), len(distros)))
    # print pairwise_distro_vals.shape
    # distro_order = []
    # for i, (name, distro) in enumerate(distros.iteritems()):
    #     distro_order.append(name.split("-")[-1])
    #     if i % 1000 == 0:        
    #         logging.info("processed distro #%d/%d", i, len(distros))
    # if "WEIGHTS" in args:
    #     ldata, llabels, lfeatures = unpack_numpy(args["WEIGHTS"])
    #     lfeatures = [x.get("_NAME").split("_=_")[-1] for x in lfeatures]
    #     indices = [i for i, x in enumerate(lfeatures) if x in distro_order]
    #     zeros = [i for i, x in enumerate(distro_order) if x not in lfeatures and i <= len(indices) ]
    #     ldata = ldata[:, indices]
    #     ldata = numpy.insert(ldata, zeros + [len(indices) for i in range(len(distro_order) - len(zeros) - len(indices))], 0.0, axis=1)
    #     weights = numpy.empty(shape=pairwise_distro_vals.shape)
    #     for r in range(weights.shape[0]):
    #         for c in range(weights.shape[1]):
    #             weights[r, c, :] = ldata[[r, c], :].sum(0)
    #     res = numpy.average(pairwise_distro_vals, axis=2, weights=weights)
    # else:
    #     res = numpy.average(pairwise_distro_vals, axis=2)
    # newmat = pairwise_distro_vals.sum(2) / pairwise_distro_vals.shape[2]
    # newmat = newmat[[i for i, x in enumerate(labels) if x["_NAME"] not in args.get("FILTER", [])], :][:, [i for i, x in enumerate(labels) if x["_NAME"] not in args.get("FILTER", [])]]
    # filter rows and labels together so the pairwise output stays square
    keep = [i for i, x in enumerate(labels) if x["_NAME"] not in args.get("FILTER", [])]
    labels = [labels[i] for i in keep]
    mat = mat[keep, :]
    data = env["FUNCTION"](mat)
    pack_numpy(target[0].rstr(), data=data, labels=labels, features=labels)
    numpy.seterr(divide="warn", invalid="warn")
    return None
Example #11
def filter_counts(target, source, env):
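    # Keeps the features whose total count meets args["THRESHOLD"], or the
    # args["NUMBER"] features with the highest totals.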
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    mat = mat.tocsc()
    totals = numpy.asarray(mat.sum(0)).ravel()
    if "THRESHOLD" in args:
        keep = [i for i, x in enumerate(totals) if x >= args["THRESHOLD"]]
    elif "NUMBER" in args:
        keep = [j for y, j in sorted([(x, i) for i, x in enumerate(totals)], reverse=True)[0:args["NUMBER"]]]
    else:
        keep = range(len(totals))
    logging.info("keeping %d/%d features", len(keep), len(totals))
    newmat = mat[:, keep]
    newfeatures = [features[i] for i in keep]
    pack_numpy(target[0].rstr(), data=newmat.tocoo(), labels=labels, features=newfeatures)
    return None
Example #12
def combine(target, source, env):
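    # Stacks several packed matrices vertically: builds the union of their
    # feature sets, then remaps each file's columns into the combined space
    # while concatenating the label lists.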
    files = [unpack_numpy(x.rstr(), dense=False) for x in source]
    all_labels = []
    all_features = [dict(y) for y in sorted(set(sum([[tuple(i.iteritems()) for i in x[2]] for x in files], [])))]
    feature_mapping = dict([(tuple(b.iteritems()), a) for a, b in enumerate(all_features)])
    R, C, V = [], [], []
    label_offset = 0
    for mat, labels, features in files:
        all_labels += [tuple(x.iteritems()) for x in labels]
        features_tuples = [tuple(x.iteritems()) for x in features]
        for row, col, val in zip(*sparse.find(mat)):
            R.append(row + label_offset)
            C.append(feature_mapping[features_tuples[col]])
            V.append(val)
        label_offset += mat.shape[0]
    newmat = sparse.coo_matrix((V, (R, C)), shape=(len(all_labels), len(all_features)))
    pack_numpy(target[0].rstr(), data=newmat, labels=[dict(x) for x in all_labels], features=all_features)
    return None
Example #13
def sum_over_labels(target, source, env):
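    # Sums the rows that share the same label dict; with args["SINGLE"] set,
    # sums everything into one row under that single label.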
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    labels = [tuple(x.iteritems()) for x in labels]
    all_labels = sorted(set(labels))
    label_mapping = dict([(b, a) for a, b in enumerate(all_labels)])
    total = len(sparse.find(mat)[0])
    if args.get("SINGLE", None):
        newlabels = [args.get("SINGLE", "?")]
        newmat = mat.sum(0)
    else:
        R, C, V = [], [], []
        for row, col, val in zip(*sparse.find(mat)):
            R.append(label_mapping[labels[row]])
            C.append(col)
            V.append(val)
    newmat = sparse.coo_matrix((V, (R, C)), shape=(len(all_labels), len(features)))
    pack_numpy(target[0].rstr(), data=newmat, labels=[dict(x) for x in all_labels], features=features)
    return None
Example #14
def merge_features(target, source, env):
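    # Collapses feature columns into clusters and sums each cluster's member
    # columns. Handles two merge formats: what looks like k-means output
    # ("cluster"/"centers"/"labels" arrays, with an extra bucket for unmapped
    # features) or an explicit list of feature-name clusters.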
    args = source[-1].read()
    mat, labels, features = unpack_numpy(source[0].rstr())
    mat = mat.todense()
    merge = numpy.load(source[1].rstr())
    if "cluster" in merge:
        newmat = numpy.empty(shape=(len(labels), merge["centers"].shape[0] + 1))
        feature_map = dict([(k.get("_NAME"), v) for k, v in zip(merge["labels"], merge["cluster"])])
        clusters = [[] for i in range(merge["centers"].shape[0] + 1)]
        for i, f in enumerate(features):
            clusters[feature_map.get(f.get("_NAME"), -1)].append(i)
        for i in range(len(clusters)):
            newmat[:, i] = mat[:, clusters[i]].sum(1)
        newfeatures = [str(i) for i in range(len(clusters))]
    else:
        newmat = sparse.lil_matrix(numpy.empty(shape=(len(labels), len(merge))))
        features = [f.get("_NAME") for f in features]
        newfeatures = ["VCLUST_=_%s" % str(i) for i in range(len(merge))]
        for i, cluster in enumerate(merge):
            newmat[:, i] = mat[:, [features.index(f) for f in cluster if f in features]].sum(1)
    pack_numpy(target[0].rstr(), data=sparse.coo_matrix(newmat), labels=labels, features=newfeatures)
    return None
Example #15
def subsample_numpy_file(target, source, env):
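    # Builds args["COUNT"] random subsamples: accumulates feature counts over
    # windows of args["WINDOW"] text blocks, then for each sample sums a
    # random subset of args.get("ITEMS", 1) windows into one matrix row.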
    args = source[-1].read()
    labels = ["sample %d" % x for x in range(1, args["COUNT"] + 1)]
    items = []
    features = [] #set()
    item = {}
    for i, m in enumerate(re.finditer("\n\s*\n(.*?)(^<c>.*?)$", meta_open(source[0].rstr()).read(), re.M | re.S)):
        s = m.group(1) + m.group(2)
        for extractor in env["FEATURE_EXTRACTORS"]:
            feats = extractor(s)
            features += feats.keys() #features.union(feats.keys())
            for k, v in feats.iteritems():
                item[k] = item.get(k, 0.0) + v
        if i % args["WINDOW"] == 0 and i != 0:
            logging.info("made item #%d", len(items))
            items.append(item)
            item = {}
        if i % 100000 == 0 and i != 0:
            features = list(set(features))
    features = dict([(y, i) for i, y in enumerate(sorted(set(features)))])
    assignments = [0 for i in range(len(items) - args.get("ITEMS", 1))] + \
                  [1 for i in range(args.get("ITEMS", 1))]
    vals = []
    for r in range(args["COUNT"]):
        logging.info("building random sample #%d", (r + 1))
        random.shuffle(assignments)
        ritems = [items[i] for i, x in enumerate(assignments) if x == 1]
        #curfeats = set(sum([items[i].keys() for i, y in enumerate(assignments) if y == 1], []))
        curfeats = set(sum([i.keys() for i in ritems], []))
        logging.info("%d features", len(curfeats))
        #sample = dict([(f, sum([items[i].get(f, 0) for i, y in enumerate(assignments) if y == 1])) for f in curfeats])
        sample = dict([(f, sum([i.get(f, 0) for i in ritems])) for f in curfeats])
        vals += [(sample[f], r, features[f]) for f in curfeats]
    V, R, C = zip(*vals)
    mat = sparse.coo_matrix((V, (R, C)), shape=(len(labels), len(features)))
    logging.info("created %dx%d data file", len(labels), len(features))
    pack_numpy(target[0].rstr(), data=mat, labels=labels, features=sorted(features.keys()))
    return None
Example #16
def update_numpy_file(target, source, env):
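    # Repacks an existing file unchanged, coercing the data to COO sparse
    # format (presumably to migrate older files to the current layout).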
    data, labels, features = unpack_numpy(source[0].rstr())
    data = sparse.coo_matrix(data)
    pack_numpy(target[0].rstr(), data=data, labels=labels, features=features)
    return None