Example #1
def test_ica(self):
    ica_data = os.path.join(DATA_DIR, "ica.txt")
    ica_results = os.path.join(DATA_DIR, "results/ica")
    data = parse(self.sc.textFile(ica_data), "raw")
    w, sigs = ica(data, 4, 4, svdmethod="direct", seed=1)
    w_true = loadmat(os.path.join(ica_results, "w.mat"))["w"]
    sigs_true = loadmat(os.path.join(ica_results, "sigs.mat"))["sigs"]
    tol = 10e-02
    assert(allclose(w, w_true, atol=tol))
    assert(allclose(transpose(sigs.collect()), sigs_true, atol=tol))
Example #2
    method = SigProcessingMethod.load("stats", statistic=statistic)
    vals = method.calc(data)

    return vals

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="compute summary statistics on time series data")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("mode", choices=("mean", "median", "std", "norm"),
                        help="which summary statistic")
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)

    args = parser.parse_args()
    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "ref", pyFiles=egg)

    lines = sc.textFile(args.datafile)
    data = parse(lines, args.preprocess).cache()

    vals = stats(data, args.mode)

    outputdir = args.outputdir + "-stats"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    saveout(vals, outputdir, "stats_" + args.mode, "matlab")
Example #3
    normDists = data.map(lambda p: closestPoint((p - mean(p)) / norm(p), centers, "corr")[1])

    return labels, centers, dists, normDists


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="do kmeans clustering")
    parser.add_argument("master", type=str)
    parser.add_argument("dataFile", type=str)
    parser.add_argument("dataMode", choices=("raw", "dff", "sub"), help="form of data preprocessing")
    parser.add_argument("outputDir", type=str)
    parser.add_argument("k", type=int)
    parser.add_argument("dist", choices=("euclidean", "correlation"), help="distance metric for kmeans")

    args = parser.parse_args()
    egg = glob.glob(os.environ["THUNDER_EGG"] + "*.egg")
    sc = SparkContext(args.master, "kmeans", pyFiles=egg)

    lines = sc.textFile(args.dataFile)
    data = parse(lines, args.dataMode).cache()

    labels, centers, dists, normDists = kmeans(data, args.k, args.dist)

    outputDir = args.outputDir + "-kmeans"
    if not os.path.exists(outputDir):
        os.makedirs(outputDir)
    saveout(labels, outputDir, "labels", "matlab")
    saveout(dists, outputDir, "dists", "matlab")
    saveout(centers, outputDir, "centers", "matlab")
    saveout(normDists, outputDir, "normDists", "matlab")
Example #4
def get_data_query(self):
    return parse(self.sc.textFile(FISH_DATA), "dff", "linear", None, [88, 76]).cache()
Example #5
def get_data_ref(self):
    return parse(self.sc.textFile(FISH_DATA), "raw", "xyz").cache()
Example #6
if len(argsIn) < 5:
    print >> sys.stderr, "usage: ica <master> <inputFile> <outputFile> <k> <c>"
    exit(-1)

# parse inputs
sc = SparkContext(argsIn[0], "ica")
dataFile = str(argsIn[1])
outputDir = str(argsIn[2]) + "-ica"
k = int(argsIn[3])
c = int(argsIn[4])
if not os.path.exists(outputDir):
    os.makedirs(outputDir)

# load data
lines = sc.textFile(dataFile)
data = parse(lines, "raw").cache()
n = data.count()

# reduce dimensionality
comps, latent, scores = svd1(data, k, 0)

# whiten data
whtMat = real(dot(inv(diag(sqrt(latent))), comps))
unwhtMat = real(dot(transpose(comps), diag(sqrt(latent))))
wht = data.map(lambda x: dot(whtMat, x)).cache()

# do multiple independent component extraction
B = orth(random.randn(k, c))
Bold = zeros((k, c))
iterNum = 0
minAbsCos = 0
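
The snippet breaks off right before the iteration itself. As a hypothetical continuation (a sketch, not the library's own code), a symmetric FastICA fixed-point loop with a cubic nonlinearity could pick up from this setup roughly as follows; iterMax and termTol are assumed names that do not appear above.

# Hypothetical continuation: symmetric FastICA fixed-point iteration on the
# whitened RDD, using a cubic nonlinearity. iterMax and termTol are assumed.
from numpy import outer, dot, transpose, diag, real
from numpy.linalg import inv
from scipy.linalg import sqrtm

iterMax = 100    # assumed iteration cap
termTol = 1e-6   # assumed convergence tolerance

while (iterNum < iterMax) and ((1 - minAbsCos) > termTol):
    iterNum += 1
    # fixed-point update: E[x (B'x)^3] - 3B (E[g'(u)] ~ 3 for unit-variance projections)
    B = wht.map(lambda x: outer(x, dot(x, B) ** 3)).reduce(lambda a, b: a + b) / n - 3 * B
    # symmetric decorrelation: B <- B (B'B)^(-1/2)
    B = dot(B, real(sqrtm(inv(dot(transpose(B), B)))))
    # convergence: smallest |cosine| between matching old and new columns
    minAbsCos = min(abs(diag(dot(transpose(B), Bold))))
    Bold = B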
Example #7
def get_data_fourier(self):
    return parse(self.sc.textFile(FISH_DATA), "dff").cache()
Example #8
    result = data.join(means)

    # get correlations and sort by key so result is in the right order
    corr = result.map(lambda (k, v): (k, corrcoef(v[0], v[1])[0, 1])).sortByKey().map(
        lambda (k, v): v)

    return corr


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="correlate time series with neighbors")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("sz", type=int)
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)

    args = parser.parse_args()
    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "localcorr", pyFiles=egg)

    lines = sc.textFile(args.datafile)
    data = parse(lines, args.preprocess, "xyz").cache()

    corrs = localcorr(data, args.sz)

    outputdir = args.outputdir + "-localcorr"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    saveout(corrs, outputdir, "corr", "matlab")
Example #9
    method = SigProcessingMethod.load("fourier", freq=freq)
    out = method.calc(data).cache()

    co = out.map(lambda x: x[0])
    ph = out.map(lambda x: x[1])

    return co, ph

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="compute a fourier transform on each time series")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("freq", type=int)
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)

    args = parser.parse_args()
    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "fourier", pyFiles=egg)

    lines = sc.textFile(args.datafile)
    data = parse(lines, "dff")

    co, ph = fourier(data, args.freq)

    outputdir = args.outputdir + "-fourier"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)
    saveout(co, outputdir, "co", "matlab")
    saveout(ph, outputdir, "ph", "matlab")
Example #10
def get_data_tuning(self):
    return parse(self.sc.textFile(FISH_DATA), "dff").cache()
Example #11
            lambda (k, x): x).mean()

    return ts


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="query time series data by averaging values for given indices")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("indsfile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("mx_x", type=int)
    parser.add_argument("mx_y", type=int)
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)

    args = parser.parse_args()
    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "query", pyFiles=egg)

    # TODO: use sortByKey instead of specifying mxX and mxY
    lines = sc.textFile(args.datafile)
    data = parse(lines, "dff", "linear", None, [args.mx_x, args.mx_y]).cache()

    ts = query(data, args.indsfile)

    outputdir = args.outputdir + "-query"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    saveout(ts, outputdir, "ts", "matlab")
Example #12
def get_data_shotgun(self):
    return parse(self.sc.textFile(SHOTGUN_DATA), "raw", "linear", None, [1, 1]).cache()
Example #13
def get_data_regression(self):
    return parse(self.sc.textFile(FISH_DATA), "dff").cache()
Example #14
def get_data_kmeans(self):
    return parse(self.sc.textFile(IRIS_DATA), "raw")
Example #15
def get_data_rpca(self):
    return parse(self.sc.textFile(RPCA_DATA), "raw")
Example #16
    return RDD.map(lambda x: dot(x, vthresh))


def shrinkage(RDD, thresh):
    return RDD.map(lambda x: sign(x) * shrinkVec(x, thresh))

# parse inputs
sc = SparkContext(argsIn[0], "rpca")
dataFile = str(argsIn[1])
outputDir = str(argsIn[2]) + "-rpca"
if not os.path.exists(outputDir):
    os.makedirs(outputDir)

# load data
lines = sc.textFile(dataFile)
data = parse(lines, "dff").cache()
n = data.count()
m = len(data.first())

# create broadcast variables
M = array(data.collect())
L = zeros((n, m))
S = zeros((n, m))
Y = zeros((n, m))

mu = float(12)
lam = 1/sqrt(n)

iterNum = 0
iterMax = 50
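
The snippet ends after initializing the loop variables. As a hypothetical continuation (a plain NumPy sketch, not the distributed implementation suggested by the RDD helpers above), the inexact augmented Lagrangian iteration for robust PCA that these variables (M, L, S, Y, mu, lam, iterMax) typically drive looks roughly like this:

# Hypothetical continuation: inexact ALM iteration for robust PCA,
# minimizing ||L||_* + lam*||S||_1 subject to M = L + S.
from numpy import sign, maximum, dot
from numpy.linalg import svd

def soft_threshold(X, tau):
    # elementwise shrinkage operator
    return sign(X) * maximum(abs(X) - tau, 0)

while iterNum < iterMax:
    iterNum += 1
    # low-rank update: singular value thresholding of M - S + Y/mu
    U, s, Vt = svd(M - S + Y / mu, full_matrices=False)
    L = dot(U * soft_threshold(s, 1.0 / mu), Vt)
    # sparse update: elementwise shrinkage of M - L + Y/mu
    S = soft_threshold(M - L + Y / mu, lam / mu)
    # dual variable update on the residual M - L - S
    Y = Y + mu * (M - L - S)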