#!/usr/bin/env python

import climate
import gzip
import io
import joblib
import lmj.cubes
import numpy as np
import os
import pandas as pd
import pickle
import sklearn.decomposition

logging = climate.get_logger('encode')


def load_jacobian(fn):
    df = pd.read_csv(fn, index_col='time').dropna()
    cols = [c for c in df.columns if c.startswith('jac')]
    return df[cols].astype('f')


def encode(pca, fn):
    df = load_jacobian(fn)
    cols = [c for c in df.columns if c.startswith('jac')]
    xf = pca.transform(df[cols].values)
    k = xf.shape[1]
    for c in cols:
        del df[c]
    for i in range(k):
        df['pc-{}'.format(i)] = xf[:, i]
import numpy as np
import theanets
import climate
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d

logging = climate.get_logger('lstm-joint')
climate.enable_default_logging()


def plotLearningCurve():
    # trn_loss and valid_loss are expected to be module-level loss histories
    # collected during training.
    fig = plt.figure(0, figsize=(10, 8))
    fig.clf()
    plt.plot(trn_loss, label='Training Set Error', linestyle='--', linewidth=2)
    plt.plot(valid_loss, label='Validation Set Error', linewidth=2)
    plt.title('Cross-Entropy Error')
    plt.xlabel('Epoch')
    plt.ylabel('Error')
    plt.legend()
    plt.savefig('error')


# Original joint data, one file per recorded behavior.
BallLiftJoint = np.loadtxt('../../../20fpsFullBehaviorSampling/BallLift/JointData.txt').astype(np.float32)
BallRollJoint = np.loadtxt('../../../20fpsFullBehaviorSampling/BallRoll/JointData.txt').astype(np.float32)
BellRingLJoint = np.loadtxt('../../../20fpsFullBehaviorSampling/BellRingL/JointData.txt').astype(np.float32)
BellRingRJoint = np.loadtxt('../../../20fpsFullBehaviorSampling/BellRingR/JointData.txt').astype(np.float32)
BallRollPlateJoint = np.loadtxt('../../../20fpsFullBehaviorSampling/BallRollPlate/JointData.txt').astype(np.float32)
RopewayJoint = np.loadtxt('../../../20fpsFullBehaviorSampling/Ropeway/JointData.txt').astype(np.float32)

# Remap joint angles from [-2.2, 2.2] radians onto [-1, 1].
jointRemap = interp1d([-2.2, 2.2], [-1, 1])
import climate
import numpy as np
from sklearn.decomposition import PCA

logging = climate.get_logger('train-pca')


@climate.annotate(
    filenames='compute PCA from these datasets',
)
def main(*filenames):
    pca = PCA(n_components=None)
    pca.fit(np.vstack([np.load(f, mmap_mode='r') for f in filenames]))
    # searchsorted against the *cumulative* variance ratio gives the index of
    # the first component at which the target variance fraction is reached.
    r = np.cumsum(pca.explained_variance_ratio_)
    for variance in (0.5, 0.8, 0.9, 0.95, 0.98, 0.99, 0.995, 0.998, 0.999):
        logging.info('to explain variance %.3f -> %4d components',
                     variance, 1 + r.searchsorted(variance))


if __name__ == '__main__':
    climate.call(main)
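# Quick toy check of the component-count logic above (numbers are made up,
# not from any dataset): searchsorted on the cumulative ratio returns the
# first index at which the target variance is reached.
import numpy as np

ratio = np.array([0.6, 0.2, 0.1, 0.05, 0.05])
cum = np.cumsum(ratio)  # [0.6, 0.8, 0.9, 0.95, 1.0]
for target in (0.5, 0.9, 0.99):
    print(target, '->', 1 + cum.searchsorted(target), 'components')
# 0.5 -> 1, 0.9 -> 3, 0.99 -> 5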
import climate
import lmj.cubes
import lmj.plot
import numpy as np
import pandas as pd
import theanets

logging = climate.get_logger('posture->jac')

BATCH = 256
THRESHOLD = 100


def load_markers(fn):
    df = pd.read_csv(fn, index_col='time').dropna()
    cols = [c for c in df.columns if c.startswith('marker') and c[-1] in 'xyz']
    return df[cols].astype('f')


def load_jacobian(fn):
    df = pd.read_csv(fn, index_col='time').dropna()
    cols = [c for c in df.columns if c.startswith('pc')]
    return df[cols].astype('f')


def main(root):
    match = lmj.cubes.utils.matching
    bodys = [load_markers(f) for f in sorted(match(root, '*_body.csv.gz'))]
    nbody = bodys[0].shape[1]
    logging.info('loaded %d body-relative files', len(bodys))
validation set. Clearly overtraining is a critical issue here.

This example only works with Python 2 at the moment.
"""

import climate
import io
import numpy as np
import theanets
import scipy.io
import os
import tempfile
import urllib
import zipfile

logging = climate.get_logger("lstm-chime")
climate.enable_default_logging()

BATCH_SIZE = 32
TRAIN_NC = os.path.join(tempfile.gettempdir(), "chime1_train.nc")
VALID_NC = os.path.join(tempfile.gettempdir(), "chime1_valid.nc")
ZIPURL = "https://github.com/craffel/lstm_benchmarks/archive/master.zip"

# If needed, get the data files from https://github.com/craffel/lstm_benchmarks.
if not os.path.isfile(TRAIN_NC) or not os.path.isfile(VALID_NC):
    logging.info("attempting data copy from url: %s", ZIPURL)
    z = zipfile.ZipFile(io.BytesIO(urllib.urlopen(ZIPURL).read()))
    with open(TRAIN_NC, "wb") as savefile:
        savefile.write(z.read("lstm_benchmarks-master/data/train_1_speaker.nc"))
    with open(VALID_NC, "wb") as savefile:
import climate
import glob
import numpy as np
import os
import pickle
from sklearn.decomposition import PCA

logging = climate.get_logger("compress")


@climate.annotate(
    data="process data in this directory",
    pattern="process files matching this pattern",
    components=("compress using this many PCA components", "option"),
    whiten=("whiten the compressed data", "option"),
)
def main(data, pattern, components="mle", whiten=None):
    # Parse the components option: an integer gives an exact count, a float
    # gives a variance fraction, and anything else (e.g. "mle") passes through.
    # The second test must be elif: once components is an int, it no longer
    # has a .replace method.
    desc = components
    if components.isdigit():
        components = int(components)
        desc = "k{}".format(components)
    elif components.replace(".", "").isdigit():
        components = float(components)
        desc = "r{}".format(components)
    whiten = bool(whiten)
    if whiten:
        desc += "-white"
    matches = sorted(glob.glob(os.path.join(data, "eur??-{}.npy".format(pattern))))
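# For reference, scikit-learn's PCA treats the three `components` spellings
# parsed above differently (a quick sketch of the semantics; the values are
# illustrative):
from sklearn.decomposition import PCA

PCA(n_components=32)     # int: keep exactly 32 components ("k32")
PCA(n_components=0.99)   # float in (0, 1): keep enough for 99% variance ("r0.99")
PCA(n_components='mle')  # 'mle': choose the dimensionality via Minka's MLE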
import re
import climate
import ConfigParser
import cPickle
import theano
import theano.tensor as T
import theano.sandbox.rng_mrg
import lasagne
from lasagne.layers import ReshapeLayer, Layer
from lasagne.init import Normal
from lasagne.regularization import regularize_layer_params_weighted, l2, l1
from lasagne.regularization import regularize_layer_params

logging = climate.get_logger('trainer')
climate.enable_default_logging()


def load_model(filename):
    with open(filename, 'rb') as handle:
        return cPickle.load(handle)


def save_model(filename, model):
    params = lasagne.layers.get_all_param_values(model)
    with open(filename, 'wb') as handle:
        cPickle.dump(params, handle, protocol=cPickle.HIGHEST_PROTOCOL)
import climate
import matplotlib.pyplot as plt
import numpy as np
import sklearn.cluster
import sklearn.utils

logging = climate.get_logger('train-kmeans')


@climate.annotate(
    output='save centroids here',
    filename='train clustering on this file',
    clusters=('number of clusters', 'option', None, int),
)
def main(output, filename, clusters):
    X = np.load(filename, mmap_mode='r')
    idx = np.random.permutation(len(X))
    cut = int(0.9 * len(idx))
    model = sklearn.cluster.MiniBatchKMeans(
        batch_size=10 * clusters,
        max_no_improvement=10 * clusters,
        n_clusters=clusters,
        n_init=13,
        reassignment_ratio=0.01,
        verbose=True,
    )
    model.fit(X[idx[:cut]])
    D = model.cluster_centers_
    logging.info('%s: saving %s codebook', output, D.shape)
    np.save(output, D.astype('f'))
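# A hedged sketch of how the saved codebook might be used downstream: quantize
# new rows by nearest centroid. `centroids.npy` and `X_new` are illustrative
# names, not from the original script.
import numpy as np
import sklearn.metrics

D = np.load('centroids.npy')
X_new = np.random.randn(100, D.shape[1]).astype('f')
codes = sklearn.metrics.pairwise_distances_argmin(X_new, D)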
import c3d
import importlib.util
import io
import unittest

from test.base import Base
from test.zipload import Zipload

# climate is an optional dependency; configure logging only if it is installed.
climate_spec = importlib.util.find_spec("climate")
if climate_spec:
    import climate
    logging = climate.get_logger('test')
    climate.enable_default_logging()


class ReaderTest(Base):

    def test_format_pi(self):
        r = c3d.Reader(Zipload._get('sample01.zip', 'Eb015pi.c3d'))
        self._log(r)
        assert r.point_used == 26
        assert r.point_rate == 50

    def test_format_pr(self):
        r = c3d.Reader(Zipload._get('sample01.zip', 'Eb015pr.c3d'))
        self._log(r)
        assert r.point_used == 26
        assert r.point_rate == 50

    def test_paramsa(self):
        r = c3d.Reader(Zipload._get('sample08.zip', 'TESTAPI.c3d'))
import climate
import lmj.plot as plt
import numpy as np
import seaborn as sns
import theanets

logging = climate.get_logger('rica')

import models

climate.add_arg('--codebook', metavar='FILE',
                help='save codebook to FILE')
climate.add_arg('--frames', type=int, metavar='T',
                help='train on sequences of T frames')
climate.add_arg('--overcomplete', type=float, default=2, metavar='K',
                help='learn a Kx overcomplete codebook')


def main(args):
    data = np.load(args.dataset, mmap_mode='r')
    N = data.shape[1]
    T = args.frames
    K = int(N * T * args.overcomplete)

    def batches():
        batch = np.zeros((args.batch_size, N * T), 'f')
        for b in range(args.batch_size):
            o = np.random.randint(len(data) - T - 1)
            batch[b] = data[o:o+T].ravel()
        return [batch]

    net = theanets.Autoencoder([N * T, (K, 'linear'), (N * T, 'tied')])
    net.train(batches,
#!/usr/bin/env python

from __future__ import division

import climate
import lmj.cubes
import lmj.cubes.fill
import lmj.pca
import numpy as np
import pandas as pd

logging = climate.get_logger('fill-linear')


def fill(dfs, rank, window):
    '''Complete missing marker data using linear interpolation.

    This method alters the given `dfs` in-place.

    Parameters
    ----------
    dfs : list of pd.DataFrame
        Frames of source data. The frames will be stacked into a single large
        frame and interpolated linearly, either in the data space or (if rank
        is not None) in principal component space.
    rank : float
        Number of principal components (if >1) or fraction of variance (if in
        (0, 1)) to retain in the encoded data.
    window : int
        Model windows of this many consecutive frames.
    '''
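# A minimal self-contained sketch of the data-space branch described in the
# docstring above; the rank (PCA) and window machinery is omitted, and the
# helper name is illustrative rather than part of lmj.cubes.
import pandas as pd

def fill_linear_sketch(dfs):
    # Stack the frames, interpolate every column linearly across time, then
    # write the filled values back into the original frames in-place.
    df = pd.concat(dfs)
    filled = df.interpolate(method='linear', limit_direction='both')
    offset = 0
    for d in dfs:
        d.iloc[:, :] = filled.iloc[offset:offset + len(d)].values
        offset += len(d)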
#!/usr/bin/env python

import climate
import cPickle as pickle
import gzip
import numpy as np

logging = climate.get_logger('theanets-untie')


@climate.annotate(
    source='load a saved network from FILE',
    target='save untied network weights to FILE',
)
def main(source, target):
    opener = gzip.open if source.endswith('.gz') else open
    p = pickle.load(opener(source))
    logging.info('read from %s:', source)
    for w, b in zip(p['weights'], p['biases']):
        logging.info('weights %s bias %s %s', w.shape, b.shape, b.dtype)
    # Mirror the encoder: append transposed weights and negated biases so the
    # decoder no longer shares (ties) parameters with the encoder.
    p['weights'].extend(0 + w.T for w in p['weights'][::-1])
    p['biases'].extend(-b for b in p['biases'][-2::-1])
    p['biases'].append(np.zeros((len(p['weights'][0]),), p['biases'][0].dtype))
    logging.info('writing to %s:', target)
    for w, b in zip(p['weights'], p['biases']):
        logging.info('weights %s bias %s %s', w.shape, b.shape, b.dtype)
    opener = gzip.open if target.endswith('.gz') else open
import climate
import ConfigParser
import io
import numpy as np
import theanets
import scipy.io
import os
import tempfile
import urllib
import zipfile
import pdb
import glob
import random
import sys

logging = climate.get_logger('lstm-chime')
climate.enable_default_logging()


def map_train_val_id(fn, fn_train_id, fn_val_id):
    dict_train = {}
    dict_val = {}
    with open(fn_train_id, 'r') as fid:
        for aline in fid:
            parts = aline.strip().split()
            dict_train[parts[0]] = 1
    with open(fn_val_id, 'r') as fid:
        for aline in fid:
            parts = aline.strip().split()
from __future__ import print_function

import climate
import numpy as np
import os
import re

from constants import constants as C

logging = climate.get_logger('import-csvs')


def main(root='/tmp/measurements', output=None):
    data = []
    for s in os.listdir(root):
        subject = []
        for b in os.listdir(os.path.join(root, s)):
            block = []
            bweight, bspeed, bhand, bpaths = b.split('-')[1:]
            for t in os.listdir(os.path.join(root, s, b)):
                thand, tspeed = re.search(
                    r'(left|right)-speed_(\d\.\d+)', t).groups()
                config = np.tile([
                    C[bweight], C[bspeed], C[bhand], C[bpaths],
                    C[thand], float(tspeed)], (120, 1))
                block.append(np.hstack([
                    config,
                    np.loadtxt(os.path.join(root, s, b, t),
                               skiprows=1, delimiter=',')]))
            subject.append(block)
        if len(subject) == 3:
#!/usr/bin/env python

import climate
import matplotlib.pyplot as plt
import numpy as np
import theanets

from utils import load_mnist, plot_layers, plot_images

logging = climate.get_logger('mnist-rica')

climate.enable_default_logging()


class WeightInverse(theanets.Regularizer):

    def loss(self, layers, outputs):
        return sum((1 / (w * w).sum(axis=0)).sum()
                   for l in layers for w in l.params
                   if w.ndim > 1)


(train, ), (valid, ), _ = load_mnist()

# mean-center the digits and compute a pca whitening transform.
m = train.mean(axis=0)
train -= m
valid -= m

logging.info('computing whitening transform')
vals, vecs = np.linalg.eigh(np.dot(train.T, train) / len(train))
vals = vals[::-1]
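# The file is truncated above; for reference, a hedged sketch of the whitening
# transform it is setting up: scale the leading eigenvectors by the inverse
# square roots of their eigenvalues, then project with np.dot(x, W).
import numpy as np

def whitening_matrix(x, k):
    vals, vecs = np.linalg.eigh(np.dot(x.T, x) / len(x))
    vals, vecs = vals[::-1][:k], vecs[:, ::-1][:, :k]
    return vecs / np.sqrt(vals)[None, :]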
import torch.nn as nn
import numpy as np
import os
import glob
import pdb

#from data_loader_lstm import get_loader
from data_loader_lstm_gaussian_aug import get_loader
from model_lstm import SkeletonAction_AVG_H as SkeletonAction
from torch.autograd import Variable
import torch.nn.functional as F
from torch.nn.utils.clip_grad import clip_grad_norm

import climate

logging = climate.get_logger(__name__)
climate.enable_default_logging()


def main(args):
    # Build data loader
    if not os.path.isdir(args.model_path):
        os.makedirs(args.model_path)
    data_loader, ds_class = get_loader(args.data_dir, args.batch_size,
                                       shuffle=True,
                                       num_workers=args.num_workers,
                                       ds=args.ds)

    # Build eval data loader
    if hasattr(ds_class, 'lbl2id'):
        eval_data_loader, _ = get_loader(args.data_dir_test, args.batch_size,
                                         shuffle=True,
                                         num_workers=args.num_workers,
                                         ds=args.ds, lbl2id=ds_class.lbl2id)
#!/usr/bin/env python

import climate
import matplotlib.pyplot as plt
import numpy as np
import theanets

from utils import load_cifar, plot_layers, plot_images

logging = climate.get_logger('cifar')

g = climate.add_group('CIFAR Example')
g.add_argument('--features', type=int, default=0, metavar='N',
               help='train a model using N^2 hidden-layer features')

K = 655  # this retains 99% of the variance in the cifar images.


def pca(dataset):
    mean = dataset[:3000].mean(axis=0)

    logging.info('computing whitening transform')
    x = dataset[:3000] - mean
    vals, vecs = np.linalg.eigh(np.dot(x.T, x) / len(x))
    vals = vals[::-1]
    vecs = vecs[:, ::-1]

    vals = np.sqrt(vals[:K])
    vecs = vecs[:, :K]

    def whiten(x):
#!/usr/bin/env python

from __future__ import division

import climate
import joblib
import lmj.cubes
import pandas as pd
import scipy.signal

logging = climate.get_logger('lowpass')


def lowpass(df, freq=10., order=4):
    '''Filter marker data using a Butterworth low-pass filter.

    This method alters the data in `df` in-place.

    Parameters
    ----------
    df : pd.DataFrame
        Frame of time-indexed marker data to filter.
    freq : float, optional
        Use a Butterworth filter with this cutoff frequency. Defaults to 10Hz.
    order : int, optional
        Order of the Butterworth filter. Defaults to 4.
    '''
    nyquist = 1 / (2 * pd.Series(df.index).diff().mean())
    assert 0 < freq < nyquist
    passes = 2  # filtfilt makes two passes over the data.
    # Split the order across the two passes and widen the cutoff so the
    # combined response matches the requested single-pass filter; butter
    # expects an integer order.
    correct = (2 ** (1 / passes) - 1) ** 0.25
    b, a = scipy.signal.butter(int(order / passes), (freq / correct) / nyquist)
    for c in df.columns:
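# The per-column loop above is cut off; a self-contained sketch of the same
# zero-phase filtering on synthetic data (names and values here are
# illustrative, not from the original script):
import numpy as np
import scipy.signal

t = np.arange(0, 1, 0.01)                    # 100 Hz sampling
noisy = np.sin(2 * np.pi * 2 * t) + 0.5 * np.random.randn(len(t))
b, a = scipy.signal.butter(2, 10. / 50.)     # order 2, 10 Hz cutoff, 50 Hz Nyquist
smooth = scipy.signal.filtfilt(b, a, noisy)  # two passes cancel the phase lag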
#!/usr/bin/env python

import climate
import lmj.cubes
import lmj.cubes.fill
import numpy as np
import pandas as pd

logging = climate.get_logger('fill')


def svt(dfs, threshold, window):
    '''Complete missing marker data using singular value thresholding.

    This method alters the given `dfs` in-place.

    Parameters
    ----------
    dfs : list of pd.DataFrame
        Frames of source data. The frames will be stacked into a single
        large frame to use during SVT. This stacked frame will then be
        split and returned.
    threshold : float
        Threshold for singular values. If None, use a value computed from
        the spectrum of singular values.
    window : int
        Model windows of this many consecutive frames.
    '''
    df = lmj.cubes.fill.stack(dfs, window)
    centers = lmj.cubes.fill.center(df)
    pos, vis, data_norm = lmj.cubes.fill.window(df, window)
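# For orientation, one pass of classic singular value thresholding (after
# Cai, Candes & Shen): soft-threshold the singular values of the current
# estimate. This is a generic sketch, not necessarily what lmj.cubes.fill
# implements.
import numpy as np

def svt_step(data, mask, threshold):
    # Zero-fill missing entries, soft-threshold the spectrum, and keep the
    # observed entries from the original data.
    u, s, vt = np.linalg.svd(np.where(mask, data, 0), full_matrices=False)
    s = np.maximum(s - threshold, 0)
    return np.where(mask, data, np.dot(u * s, vt))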
#!/usr/bin/env python

import climate
import joblib
import lmj.cubes
import numpy as np
import pandas as pd

logging = climate.get_logger('reindex')

# this is the set of markers that gets included in our output.
MARKERS = [
    'marker00-r-head-back',
    'marker01-r-head-front',
    'marker02-l-head-front',
    'marker03-l-head-back',
    #'marker04-r-head-mid',
    #'marker05-l-head-mid',
    'marker06-r-collar',
    'marker07-r-shoulder',
    'marker08-r-elbow',
    'marker09-r-wrist',
    #'marker10-r-fing-pinky',
    #'marker11-r-fing-ring',
    #'marker12-r-fing-middle',
    'marker13-r-fing-index',
    'marker14-r-mc-outer',
    #'marker15-r-mc-inner',
    #'marker16-r-thumb-base',
    #'marker17-r-thumb-tip',
    'marker18-l-collar',
#!/usr/bin/env python

import climate
import numpy as np
import os
import scipy.io.wavfile
import segmentaxis

logging = climate.get_logger('extract-windows')

# relatively prime window sizes, primes < 2**k (window size in ms):
# [3 7 13 31 61] 127 (7.94) 251 (15.68) 509 (31.81) 1021 (63.81)


@climate.annotate(
    width=('generate windows of N samples', 'option', None, int),
    overlap=('overlap windows by R fraction of width', 'option', None, float),
    samplerate=('die if any audio files are not N fps', 'option', None, int),
    root='save outputs to this directory',
    audio='extract windows from these wav files',
)
def main(width, overlap, samplerate, root, *audio):
    if not samplerate:
        samplerate = 16000
    if not width:
        width = 512
    if not overlap:
        overlap = 0.75
    env = np.hanning(width)[None, :].astype('f')
    for f in audio:
        rate, samples = scipy.io.wavfile.read(f)
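# A numpy-only stand-in for the segmentaxis helper used above (illustrative;
# the real module's API may differ): slice a 1-D signal into overlapping
# frames and apply the same Hanning envelope the script builds as `env`.
import numpy as np

def extract_windows(samples, width=512, overlap=0.75):
    hop = int(width * (1 - overlap))
    n = 1 + (len(samples) - width) // hop
    idx = np.arange(width)[None, :] + hop * np.arange(n)[:, None]
    return samples[idx] * np.hanning(width)[None, :]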
#!/usr/bin/env python from __future__ import division import climate import lmj.cubes import lmj.cubes.fill import lmj.pca import numpy as np import pandas as pd logging = climate.get_logger("fill-linear") def fill(dfs, rank, window): """Complete missing marker data using linear interpolation. This method alters the given `dfs` in-place. Parameters ---------- dfs : list of pd.DataFrame Frames of source data. The frames will be stacked into a single large frame and interpolated linearly, either in the data space or (if rank is not None) in principal component space. rank : float Number of principal components (if >1) or fraction of variance (if in (0, 1)) to retain in the encoded data. window : int Model windows of this many consecutive frames. """
import ConfigParser
import climate
import io
import numpy as np
import theanets
import scipy.io
import os
import tempfile
import urllib
import zipfile
import pdb
import glob
import random
import sys

logging = climate.get_logger('lstm-chime')
climate.enable_default_logging()


def main(layer_nums, data_dir, model_dir, val_dir, sep_data_dir, sep_val_dir,
         **kwargs):
    layer_nums = [int(num) for num in layer_nums]
    hidden_l1 = None
    if 'hidden_l1' in kwargs:
        hidden_l1 = float(kwargs['hidden_l1'])
    l1 = None
    if 'l1' in kwargs:
        l1 = float(kwargs['l1'])
#!/usr/bin/env python

import climate
import collections
import joblib
import lmj.cubes
import lmj.plot
import numpy as np

logging = climate.get_logger("count")


def count(trial):
    trial.load()
    trial.mask_dropouts()
    total = len(trial.df)
    markers = {m: trial.df[m + "-c"].count() / total
               for m in trial.marker_columns}
    full = len(trial.df[[m + "-c" for m in markers]].dropna(axis=0))
    trial.log("%d rows, %d full (%.1f%%)", total, full, 100 * full / total)
    return markers


PERCENTILES = [1, 2, 5, 10, 20, 50, 80, 90, 95, 98, 99]


def main(root):
    trials = lmj.cubes.Experiment(root).trials_matching("*")
    counts = collections.defaultdict(int)
    percents = collections.defaultdict(list)
    f = joblib.delayed(count)
    for markers in joblib.Parallel(-1)(f(t) for t in trials):
def __init__(self, name):
    self.__name__ = name
    # Name the logger after the concrete class.
    name = self.__class__.__name__
    self.logger = climate.get_logger(name)
import climate
import numpy as np
import sklearn.decomposition

logging = climate.get_logger('train-dict')


@climate.annotate(
    output='save dictionary here',
    filename='train dictionary on this file',
    alpha=('sparsity penalty', 'option', None, float),
    features=('number of dictionary features', 'option', None, int),
)
def main(output, filename, features, alpha=0.01):
    X = np.load(filename, mmap_mode='r')
    # Use an index array: np.random.shuffle cannot shuffle a Python 3 range.
    idx = np.arange(len(X))
    np.random.shuffle(idx)
    model = sklearn.decomposition.MiniBatchDictionaryLearning(
        batch_size=features,
        n_components=features,
        alpha=alpha,
        dict_init=X[idx[:features]],
        shuffle=False,
        verbose=1,
        n_jobs=-2,
    )
    np.random.shuffle(idx)
    model.fit(X[idx])
    D = model.components_
    logging.info('saving dictionary %s', D.shape)
    np.save(output, D)
#!/usr/bin/env python

import climate
import joblib
import numpy as np
import sklearn.utils
import scipy.fftpack

logging = climate.get_logger('wave->spec')


def spectra(z):
    '''Compute a spectrogram of the given window.'''
    width = z.shape[1]
    # or for power spectral density:
    # spec ** 2 / (width * samplerate)
    return abs(scipy.fftpack.fft(z))[:, :1 + width // 2].astype('f')


@climate.annotate(
    dataset='load windowed waveforms from this npy file',
    clip=('clip resulting spectra values at N', 'option', None, float),
)
def main(dataset, clip=None):
    X = np.load(dataset, mmap_mode='r')
    logging.info('%s: windows %s %s', dataset, X.shape, X.dtype)
    slices = sklearn.utils.gen_even_slices(len(X), 100)
    spec = joblib.delayed(spectra)
    Y = np.vstack(joblib.Parallel(n_jobs=-2)(spec(X[c]) for c in slices))
    if clip:
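# Sanity check: for real-valued windows, keeping the first 1 + width//2 FFT
# bins is equivalent to a real FFT, which is why `spectra` can discard the
# mirrored half of the spectrum.
import numpy as np
import scipy.fftpack

z = np.random.randn(4, 256)
a = abs(scipy.fftpack.fft(z))[:, :1 + 256 // 2]
assert np.allclose(a, abs(np.fft.rfft(z, axis=1)))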
import c3d
import climate
import io
import os
import tempfile
import unittest
import urllib
import zipfile

logging = climate.get_logger('test')
climate.enable_default_logging()

TEMP = os.path.join(tempfile.gettempdir(), 'c3d-test')
ZIPS = (
    ('https://www.c3d.org/data/sample01.zip', 'formats.zip'),
    ('https://www.c3d.org/data/sample07.zip', 'analog.zip'),
    ('https://www.c3d.org/data/sample08.zip', 'params.zip'),
)


class Base(unittest.TestCase):

    def setUp(self):
        if not os.path.isdir(TEMP):
            os.makedirs(TEMP)
        for url, target in ZIPS:
            fn = os.path.join(TEMP, target)
            if not os.path.isfile(fn):
                try:
                    urllib.urlretrieve(url, fn)  # python 2
                except AttributeError:
                    # python 3: urlretrieve moved to urllib.request.
                    import urllib.request
                    urllib.request.urlretrieve(url, fn)
#!/usr/bin/env python

import climate
import matplotlib.pyplot as plt
import numpy as np
import theanets

from utils import load_mnist, plot_layers, plot_images

logging = climate.get_logger('mnist-rica')

climate.enable_default_logging()


class RICA(theanets.Autoencoder):

    def loss(self, weight_inverse=0, **kwargs):
        loss = super(RICA, self).loss(**kwargs)
        if weight_inverse > 0:
            loss += sum((weight_inverse / (w * w).sum(axis=0)).sum()
                        for l in self.layers for w in l.params
                        if w.ndim > 1)
        return loss


train, valid, _ = load_mnist()

# mean-center the digits and compute a pca whitening transform.
train -= 0.5
valid -= 0.5
import climate
#import lmj.plot
import numpy as np

import mel
import audio

logging = climate.get_logger('play-mel')


@climate.annotate(
    source='dataset to read',
    target=('wave file output', 'option'),
    pca=('pca transform', 'option'),
    start=('start at the Nth audio window', 'option', None, int),
    seconds=('play N seconds of audio', 'option', None, float),
)
def main(source, target='', pca='', start=0, seconds=10.):
    b = audio.Builder()
    mels = b.read(source, pca, start, seconds)
    if b.log:
        mels = np.exp(mels)

    # reconstruct the linearly-spaced spectrum by inverting the triangular
    # mel-spaced filterbank. since the filterbank converter is a rectangular
    # matrix, this will inevitably result in some loss, but by keeping the
    # number of mel filters relatively large we can minimize that loss.
    filters, _ = mel.filterbank(b.width // 2 + 1, mels.shape[1])
    finv = np.linalg.pinv(filters)
    mags = np.dot(mels, finv)
    logging.info('inverting mels %s x inverse %s => mags %s',
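# Toy demonstration of the loss mentioned above: the pseudo-inverse of a
# rectangular filterbank only recovers the component of the spectrum that
# lies in the filterbank's column space. Shapes here are hypothetical.
import numpy as np

rng = np.random.RandomState(0)
filters = abs(rng.randn(257, 64))   # (n_bins, n_mels), n_mels << n_bins
spectrum = abs(rng.randn(10, 257))
mels = np.dot(spectrum, filters)
recon = np.dot(mels, np.linalg.pinv(filters))
err = np.linalg.norm(recon - spectrum) / np.linalg.norm(spectrum)
# err > 0; it shrinks as the number of mel filters grows.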
# SOFTWARE.

'''This file contains recurrent network structures.'''

import climate
import numpy as np
import numpy.random as rng
import theano
import theano.tensor as TT

#from theano.tensor.shared_randomstreams import RandomStreams
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

from . import feedforward as ff

logging = climate.get_logger(__name__)


class Network(ff.Network):
    '''A fully connected recurrent network with one input and one output layer.

    Parameters
    ----------
    layers : sequence of int
        A sequence of three integers specifying the number of units in the
        input, hidden, and output layers, respectively.
    activation : callable(numeric) -> numeric
        A callable that takes one argument (a matrix) and returns another
        matrix. This is the activation function that each hidden unit in the
        network uses.