def test_ica(self):
    ica_data = os.path.join(DATA_DIR, "ica.txt")
    ica_results = os.path.join(DATA_DIR, "results/ica")
    data = parse(self.sc.textFile(ica_data), "raw")
    w, sigs = ica(data, 4, 4, svdmethod="direct", seed=1)
    w_true = loadmat(os.path.join(ica_results, "w.mat"))["w"]
    sigs_true = loadmat(os.path.join(ica_results, "sigs.mat"))["sigs"]
    tol = 10e-02
    assert(allclose(w, w_true, atol=tol))
    assert(allclose(transpose(sigs.collect()), sigs_true, atol=tol))
    method = SigProcessingMethod.load("stats", statistic=statistic)
    vals = method.calc(data)
    return vals


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="compute summary statistics on time series data")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("mode", choices=("mean", "median", "std", "norm"), help="which summary statistic")
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)
    args = parser.parse_args()

    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "stats", pyFiles=egg)

    lines = sc.textFile(args.datafile)
    data = parse(lines, args.preprocess).cache()

    vals = stats(data, args.mode)

    outputdir = args.outputdir + "-stats"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    saveout(vals, outputdir, "stats_" + args.mode, "matlab")
    normDists = data.map(lambda p: closestPoint((p - mean(p)) / norm(p), centers, "corr")[1])

    return labels, centers, dists, normDists


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="do kmeans clustering")
    parser.add_argument("master", type=str)
    parser.add_argument("dataFile", type=str)
    parser.add_argument("dataMode", choices=("raw", "dff", "sub"), help="form of data preprocessing")
    parser.add_argument("outputDir", type=str)
    parser.add_argument("k", type=int)
    parser.add_argument("dist", choices=("euclidean", "correlation"), help="distance metric for kmeans")
    args = parser.parse_args()

    egg = glob.glob(os.environ["THUNDER_EGG"] + "*.egg")
    sc = SparkContext(args.master, "kmeans", pyFiles=egg)

    lines = sc.textFile(args.dataFile)
    data = parse(lines, args.dataMode).cache()

    labels, centers, dists, normDists = kmeans(data, args.k, args.dist)

    outputDir = args.outputDir + "-kmeans"
    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    saveout(labels, outputDir, "labels", "matlab")
    saveout(dists, outputDir, "dists", "matlab")
    saveout(centers, outputDir, "centers", "matlab")
    saveout(normDists, outputDir, "normDists", "matlab")
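# Hypothetical sketch, not the repository's code: one plausible shape for the
# closestPoint helper used above, assumed here to return a pair of
# (index of nearest center, distance to it), so that [1] picks out the distance.
def closestPoint(p, centers, dist):
    bestIndex = 0
    bestDist = float("inf")
    for i, c in enumerate(centers):
        if dist == "corr":
            d = 1 - corrcoef(p, c)[0, 1]  # correlation distance
        else:
            d = sum((p - c) ** 2)  # squared euclidean distance
        if d < bestDist:
            bestIndex, bestDist = i, d
    return bestIndex, bestDist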
def get_data_query(self):
    return parse(self.sc.textFile(FISH_DATA), "dff", "linear", None, [88, 76]).cache()
def get_data_ref(self):
    return parse(self.sc.textFile(FISH_DATA), "raw", "xyz").cache()
if len(argsIn) < 5:
    print >> sys.stderr, "usage: ica <master> <inputFile> <outputDir> <k> <c>"
    exit(-1)

# parse inputs
sc = SparkContext(argsIn[0], "ica")
dataFile = str(argsIn[1])
outputDir = str(argsIn[2]) + "-ica"
k = int(argsIn[3])
c = int(argsIn[4])
if not os.path.exists(outputDir):
    os.makedirs(outputDir)

# load data
lines = sc.textFile(dataFile)
data = parse(lines, "raw").cache()
n = data.count()

# reduce dimensionality
comps, latent, scores = svd1(data, k, 0)

# whiten data
whtMat = real(dot(inv(diag(sqrt(latent))), comps))
unwhtMat = real(dot(transpose(comps), diag(sqrt(latent))))
wht = data.map(lambda x: dot(whtMat, x)).cache()

# do multiple independent component extraction
B = orth(random.randn(k, c))
Bold = zeros((k, c))
iterNum = 0
minAbsCos = 0
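# Hypothetical sketch, not the repository's code: the kind of FastICA fixed-point
# loop that the variables above (B, Bold, wht, iterNum, minAbsCos) are set up to
# drive. iterMax and termTol are assumed names; outer comes from numpy and sqrtm
# from scipy.linalg, both assumed to be imported.
iterMax = 100
termTol = 1e-6
while (iterNum < iterMax) and ((1 - minAbsCos) > termTol):
    iterNum += 1
    # fixed-point update with the cubic nonlinearity: E[x (B'x)^3] - 3B
    B = wht.map(lambda x: outer(x, dot(x, B) ** 3)).reduce(lambda x, y: x + y) / n - 3 * B
    # symmetric decorrelation: B <- B (B'B)^(-1/2)
    B = dot(B, real(sqrtm(inv(dot(transpose(B), B)))))
    # convergence check: smallest absolute cosine between old and new columns
    minAbsCos = min(abs(diag(dot(transpose(B), Bold))))
    Bold = B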
def get_data_fourier(self):
    return parse(self.sc.textFile(FISH_DATA), "dff").cache()
    result = data.join(means)

    # get correlations and sort by key so result is in the right order
    corr = result.map(lambda (k, v): (k, corrcoef(v[0], v[1])[0, 1])).sortByKey().map(
        lambda (k, v): v)

    return corr


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="correlate time series with neighbors")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("sz", type=int)
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)
    args = parser.parse_args()

    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "localcorr", pyFiles=egg)

    lines = sc.textFile(args.datafile)
    data = parse(lines, args.preprocess, "xyz").cache()

    corrs = localcorr(data, args.sz)

    outputdir = args.outputdir + "-localcorr"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    saveout(corrs, outputdir, "corr", "matlab")
    method = SigProcessingMethod.load("fourier", freq=freq)
    out = method.calc(data).cache()
    co = out.map(lambda x: x[0])
    ph = out.map(lambda x: x[1])
    return co, ph


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="compute a fourier transform on each time series")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("freq", type=int)
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)
    args = parser.parse_args()

    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "fourier", pyFiles=egg)

    lines = sc.textFile(args.datafile)
    data = parse(lines, args.preprocess)

    co, ph = fourier(data, args.freq)

    outputdir = args.outputdir + "-fourier"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    saveout(co, outputdir, "co", "matlab")
    saveout(ph, outputdir, "ph", "matlab")
def get_data_tuning(self):
    return parse(self.sc.textFile(FISH_DATA), "dff").cache()
        lambda (k, x): x).mean()

    return ts


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="query time series data by averaging values for given indices")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("indsfile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("mx_x", type=int)
    parser.add_argument("mx_y", type=int)
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)
    args = parser.parse_args()

    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "query", pyFiles=egg)

    # TODO: use sortByKey instead of specifying mxX and mxY
    lines = sc.textFile(args.datafile)
    data = parse(lines, args.preprocess, "linear", None, [args.mx_x, args.mx_y]).cache()

    ts = query(data, args.indsfile)

    outputdir = args.outputdir + "-query"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    saveout(ts, outputdir, "ts", "matlab")
def get_data_shotgun(self):
    return parse(self.sc.textFile(SHOTGUN_DATA), "raw", "linear", None, [1, 1]).cache()
def get_data_regression(self):
    return parse(self.sc.textFile(FISH_DATA), "dff").cache()
def get_data_kmeans(self):
    return parse(self.sc.textFile(IRIS_DATA), "raw")
def get_data_rpca(self):
    return parse(self.sc.textFile(RPCA_DATA), "raw")
    return RDD.map(lambda x: dot(x, vthresh))


def shrinkage(RDD, thresh):
    return RDD.map(lambda x: sign(x) * shrinkVec(x, thresh))


# parse inputs
sc = SparkContext(argsIn[0], "rpca")
dataFile = str(argsIn[1])
outputDir = str(argsIn[2]) + "-rpca"
if not os.path.exists(outputDir):
    os.makedirs(outputDir)

# load data
lines = sc.textFile(dataFile)
data = parse(lines, "dff").cache()
n = data.count()
m = len(data.first())

# create broadcast variables
M = array(data.collect())
L = zeros((n, m))
S = zeros((n, m))
Y = zeros((n, m))

mu = float(12)
lam = 1 / sqrt(n)

iterNum = 0
iterMax = 50
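# Hypothetical sketch, not the repository's code: the inexact augmented-Lagrangian
# RPCA iteration (M = L + S) that the variables above (M, L, S, Y, mu, lam, iterMax)
# are set up to drive; svd comes from numpy.linalg and maximum from numpy, both
# assumed to be imported.
while iterNum < iterMax:
    iterNum += 1
    # singular-value thresholding for the low-rank component L
    U, s, Vt = svd(M - S + Y / mu, full_matrices=False)
    L = dot(U, dot(diag(maximum(s - 1 / mu, 0)), Vt))
    # elementwise soft-thresholding (shrinkage) for the sparse component S
    R = M - L + Y / mu
    S = sign(R) * maximum(abs(R) - lam / mu, 0)
    # dual variable update
    Y = Y + mu * (M - L - S)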