def __init__(self, ts, ticker, corpus, filter):
    self.ticker = ticker
    self.corpus = corpus
    self.dts = ts.select('price_{}'.format(ticker))[0]
    self.price = ts.select('price_{}'.format(ticker))[1]
    self.crsp = ts.select('CRSP')[1][1:]
    self.returns = np.diff(np.log(self.price))
    self.adj_returns = zscore(self.returns - self.crsp)
    self.nt = ts.select('{}_{}_{}'.format(corpus, ticker, filter))[1][1:]
    self.sent = zscore(self.nt)
    self.friday = np.array([ts.select('friday')[1][1:]])
    self.jan = np.array([ts.select('january')[1][1:]])
    self.NWD = np.array([ts.select('NWD')[1][1:]])
def fit_model(self, bin=False):
    mt = sio.loadmat(self.data_path + self.mouse_filename)
    self.X = mt['Fsp']  # neurons by timepoints
    self.motionSVD = np.array(mt['beh'][0]['face'][0]['motionSVD'][0][0]).T
    self.parea = np.array(mt['beh'][0]['pupil'][0]['area'][0][0])
    if bin:
        self.X, self.motionSVD, self.parea = bin_data(self.X, self.motionSVD, self.parea)
    else:
        self.nt = self.motionSVD.shape[1]
        tbin = 1
        self.motionSVD = np.reshape(
            self.motionSVD[:, :self.nt * tbin],
            (self.motionSVD.shape[0], self.nt, tbin)).mean(axis=-1)
        self.parea = np.reshape(self.parea[:self.nt * tbin], (self.nt, tbin)).mean(axis=-1)
    if self.model == 'EnsemblePursuit_numpy':
        options_dict = {'seed_neuron_av_nr': 100, 'min_assembly_size': 8}
        ep_np = EnsemblePursuitNumpy(n_ensembles=self.nr_of_components,
                                     lambd=0.01,
                                     options_dict=options_dict)
        U, V = ep_np.fit_transform(self.X)
        if self.save:
            bundle = {'U': U, 'V': V}
            np.save(self.save_path + self.mouse_filename + '_spont_ep_numpy.npy', bundle)
        return U, V
    if self.model == 'EnsemblePursuit_pytorch':
        options_dict = {'seed_neuron_av_nr': 100, 'min_assembly_size': 8}
        ep_pt = EnsemblePursuitPyTorch(n_ensembles=self.nr_of_components,
                                       lambd=0.01,
                                       options_dict=options_dict)
        U, V = ep_pt.fit_transform(self.X)
        if self.save:
            bundle = {'U': U, 'V': V}
            np.save(self.save_path + self.mouse_filename + '_spont_ep_pytorch.npy', bundle)
        return U, V
    if self.model == 'NMF':
        print(self.X[self.X < 0])
        #self.X -= self.X.min(axis=0)
        self.X[self.X < 0] = 0
        self.X = self.X.T
        model = NMF(n_components=self.nr_of_components, init='nndsvd', random_state=7)
        V = model.fit_transform(self.X)
        U = model.components_
        return U.T, V
    if self.model == 'ICA':
        self.X = zscore(self.X)
        self.X = self.X.T
        ICA = FastICA(n_components=self.nr_of_components, random_state=7)
        V = ICA.fit_transform(self.X)
        U = ICA.components_
        return U.T, V
def variance_explained_across_neurons(self, U, V):
    '''
    The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares
    ((y_true - y_pred) ** 2).sum() and v is the total sum of squares
    ((y_true - y_true.mean()) ** 2).sum().
    '''
    # Fetch the original data and convert it into the same form as what goes into the
    # matrix factorization model
    mt = sio.loadmat(self.data_path + self.mouse_filename)
    X = mt['Fsp']  # neurons by timepoints
    if self.model == 'EnsemblePursuit_pytorch' or self.model == 'EnsemblePursuit_numpy':
        X = zscore(X.T).T
    u = []
    v = []
    approx = U @ V.T
    for j in range(X.shape[0]):
        u_j = ((X[j, :] - approx[j, :]) ** 2).sum()
        v_j = ((X[j, :] - np.mean(X[j, :])) ** 2).sum()
        u.append(u_j)
        v.append(v_j)
    u = np.array(u)
    v = np.array(v)
    plt.plot(1 - np.divide(u, v))
    plt.title('Variance explained across neurons')
    plt.show()
    print('Total variance explained, averaged over neurons is:',
          (1 - np.mean(u) / np.mean(v)))
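# --- Illustrative sketch, not part of the original codebase ---
# A minimal, self-contained example of the per-neuron R^2 = 1 - u/v computation described
# in the docstring above, vectorised over neurons. X, U and V here are synthetic stand-ins
# for the zscored data matrix and a hypothetical low-rank factorization X ~= U @ V.T.
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((50, 200))                  # neurons by timepoints
U = rng.standard_normal((50, 5))                    # neuron weights, one column per component
V = rng.standard_normal((200, 5))                   # temporal components
approx = U @ V.T                                    # low-rank reconstruction of X

u = ((X - approx) ** 2).sum(axis=1)                 # residual sum of squares per neuron
v = ((X - X.mean(axis=1, keepdims=True)) ** 2).sum(axis=1)  # total sum of squares per neuron
r2_per_neuron = 1 - u / v                           # R^2 for each neuron
print('mean R^2 over neurons:', r2_per_neuron.mean())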
def plot_dat(self, anomaly, data, standardise=1):
    """
    For a given detected anomaly, plots the data around that time point
    """
    # Need to add input checks
    sample_half = int(round(self.p['zt_sample_size'] / 2.))
    dat = data[anomaly - sample_half:anomaly + sample_half, :]
    if standardise:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.plot(zscore(dat))
        bpList = bp_lookup(self.p['SAX_alphabet_size'])
        for bp in bpList:
            ax.axhline(y=bp, xmin=0, xmax=dat.shape[0], ls='--', color='k')
        adjust_spines(ax, ['bottom'])
        ax.set_yticklabels([])
        ax.yaxis.set_ticks([])
        ax.set_xticklabels(range(anomaly - sample_half, anomaly + sample_half + 1))
        for tick in ax.xaxis.get_major_ticks():
            tick.label.set_fontsize(18)
    else:
        plt.figure()
        plt.plot(dat)
        bpList = bp_lookup(self.p['SAX_alphabet_size'])
        plt.hlines(bpList, xmin=0, xmax=dat.shape[0] - 1, linestyles='dashed', color='k')
def variance_explained_across_neurons(self, U, V):
    '''
    From sklearn: The coefficient R^2 is defined as (1 - u/v), where u is the residual
    sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum of squares
    ((y_true - y_true.mean()) ** 2).sum().
    '''
    # Fetch the original data and convert it into the same form as what goes into the
    # matrix factorization model
    data = io.loadmat(self.data_path + self.mouse_filename)
    resp = data['stim'][0]['resp'][0]
    spont = data['stim'][0]['spont'][0]
    X = subtract_spont(spont, resp).T
    X = zscore(X.T).T
    u = []
    v = []
    approx = U @ V.T
    for j in range(X.shape[0]):
        u_j = ((X[j, :] - approx[j, :]) ** 2).sum()
        v_j = ((X[j, :] - np.mean(X[j, :])) ** 2).sum()
        u.append(u_j)
        v.append(v_j)
    u = np.array(u)
    v = np.array(v)
    plt.plot(1 - np.divide(u, v))
    plt.title('Variance explained across neurons')
    plt.show()
    print('Total variance explained, averaged over neurons is:',
          (1 - np.mean(u) / np.mean(v)))
def __init__(*args):
    repeat = int(args[0]) if len(args) > 0 else 0
    if len(args) > 1:
        if isinstance(args[1], str):
            date1 = datetime.strptime(args[1], '%Y-%m-%d').date()
        else:
            date1 = args[1]
    # date1 = datetime.strptime(args[1], '%Y-%m-%d').date() if len(args) > 1 else None
    offset = int(args[2]) if len(args) > 2 else None
    exclude_index = 1
    results = []
    need_create = {}
    for i in range(repeat):
        delta = i * REPORT_OFFSET2
        start_date = date1 - timedelta(offset * (delta + 1))
        end_date = date1 - timedelta(offset * delta)
        # print(date1, start_date, end_date)
        path = os.path.join('data', 'zscore', f'{start_date}_{end_date}.json')
        try:
            results.append(load_json(path))
        except Exception:
            results.append({})
            need_create[i] = (start_date, end_date)
    if len(need_create):
        data_all = load_stocklist_json()
        print('len: ', len(data_all))
        for i, d in enumerate(data_all):
            if exclude_index and is_index_stock(d['codeName']):
                continue
            full_code = d['full_code']
            output = load_stock_json(full_code, start_date=datetime.now().date())['output']
            for ii, dates in need_create.items():
                z_score = zscore(output, dates[0], dates[1])
                if z_score:
                    print(i, full_code, dates[0], dates[1], z_score)
                    results[ii][full_code] = z_score
        for ii, dates in need_create.items():
            path = os.path.join('data', 'zscore', f'{dates[0]}_{dates[1]}.json')
            save_json(results[ii], path)
    return results
def SAX(data, alphabet_size, word_size, minstd=1.0, pre_normed=False):
    """
    Returns one word for each data stream.

    word_size == Number of segments data is split into for PAA
    alphabet_size == Number of symbols used

    Also now compatible with a single data stream.
    """
    if data.ndim == 1:
        num_streams = 1
        data = np.atleast_2d(data)
        data = data.T
    else:
        num_streams = data.shape[1]

    # Need to insert check here for stationary segments
    mask = data.std(axis=0) < minstd
    passed = np.invert(mask)

    if np.any(mask):
        # Scale data to have a mean of 0 and a standard deviation of 1.
        if pre_normed == False:
            data[:, passed] = zscore(data[:, passed])
        symbol4skips = string.ascii_letters[int(np.ceil(alphabet_size / 2.))]
    else:
        # Scale data to have a mean of 0 and a standard deviation of 1.
        if pre_normed == False:
            data = zscore(data)

    # Calculate our breakpoint locations.
    breakpoints = bp_lookup(alphabet_size)
    breakpoints = np.concatenate((breakpoints, np.array([np.inf])))

    # Split the data into a list of word_size pieces.
    dataWords = np.array_split(data, word_size, axis=0)

    # Predefine matrices
    segment_means = np.zeros((word_size, num_streams))
    #segment_symbol = np.zeros((word_size, num_streams), dtype=np.str)
    p_array = np.zeros((num_streams,), dtype=('a1,' * word_size + 'i2'))
    p_dict = {}

    # Calculate the mean for each section.
    for i in range(word_size):
        segment_means[i, passed] = dataWords[i][:, passed].mean(axis=0)

    # Figure out which break each section is in based on the section_means and
    # calculated breakpoints.
    for i in range(num_streams):
        for j in range(word_size):
            if passed[i]:
                idx = int(np.where(breakpoints > segment_means[j, i])[0][0])
                # Store in phrase_array
                p_array[i][j] = string.ascii_letters[idx]
            else:
                p_array[i][j] = symbol4skips
        # Store in phrase_dict
        phrase = ''.join(tuple(p_array[i])[:word_size])
        if phrase in p_dict:
            p_dict[phrase].append(i)
        else:
            p_dict[phrase] = [i]

    # Put frequency of pattern in p_array
    for vals in p_dict.values():
        count = len(vals)
        for i in range(count):
            p_array[vals[i]][-1] = count

    return p_array, p_dict, segment_means
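# --- Illustrative usage sketch, not from the original module. It assumes SAX above is
# --- importable together with its bp_lookup helper and the numpy/string imports it uses.
# Three synthetic streams of 64 samples are reduced to 8-segment words over a 5-symbol
# alphabet; p_dict groups streams that end up sharing the same SAX word.
import numpy as np

streams = np.random.randn(64, 3)                    # timepoints by streams
p_array, p_dict, segment_means = SAX(streams, alphabet_size=5, word_size=8)
for word, members in p_dict.items():
    print(word, members)                            # streams mapped to each SAX word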
def fit(self, X):
    nK = self.n_components
    lam = self.lam
    n_kmeans = self.n_kmeans
    NT, NN = X.shape
    # z-score along time dimension
    X = utils.zscore(X, axis=0)
    # convert to float64 for numerical precision
    X = np.float64(X)
    # initialize k-means clusters and compute their variance in vm
    V, vm = initialize_kmeans(X, n_kmeans, lam)
    # initialize vectors in ensemble pursuit (Vs)
    vs = np.zeros((NT, nK))
    # initialize U
    U = np.zeros((NN, nK))
    # precompute covariance matrix of neurons
    C = X.T @ X
    # keep track of number of neurons per ensemble
    ns = np.zeros(nK, )
    # time the ensemble pursuit
    t0 = time.time()
    # keep track of neuron order in ensembles
    self.order = []
    self.seed = np.zeros((NT, nK))
    self.cost_deltas = []
    # outer loop
    for j in range(nK):
        # initialize with "biggest" k-means ensemble (by variance)
        imax = np.argmax(vm)
        # zscore the seed trace
        seed = zscore(V[:, imax])
        # fit one ensemble starting from this seed
        iorder, current_v, cost_delta_lst = new_ensemble(X, C, seed, lam)
        self.order.append(iorder)
        self.seed[:, j] = seed
        # keep track of number of neurons
        ns[j] = len(iorder)
        # normalize current_v to unit norm
        current_v /= np.sum(current_v**2)**.5
        # update column of Vs
        vs[:, j] = current_v
        # projection of each neuron onto this ensemble trace
        w = current_v @ X
        # update weights for neurons in this ensemble
        U[iorder, j] = w[iorder]
        # update activity trace
        X[:, iorder] -= np.outer(current_v, w[iorder])
        # rank one update to C using wtw
        wtw = np.outer(w[iorder], w)
        # update the columns
        C[:, iorder] -= wtw.T
        # update the rows
        C[iorder, :] -= wtw
        # add back term for the submatrix of neurons in this ensemble
        C[iorder[:, np.newaxis], iorder] += wtw[:, iorder]
        # run one round of k-means because we changed X
        V, vm = one_round_of_kmeans(V, X, lam)
        self.cost_deltas.append(cost_delta_lst)
        if j % 25 == 0 or j == nK - 1:
            print('ensemble %d, time %2.2f, nr neurons %d, EV %2.4f' %
                  (j, time.time() - t0, len(iorder), 1 - np.mean(X**2)))
            print('average sparsity is %2.4f' % (np.mean(U > 1e-5)))
    self.components_ = vs
    self.weights = U
    self.residual_kmeans = V
    # the fit function has to return the model
    return self
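# --- Standalone numerical check, illustrative only and not part of the class above ---
# It verifies that the rank-one updates applied to C = X.T @ X in fit() agree with simply
# recomputing X.T @ X after the ensemble's contribution is subtracted from X. Here v is a
# unit-norm ensemble trace, w = v @ X, and "iorder" is a hypothetical set of member neurons.
import numpy as np

rng = np.random.default_rng(1)
NT, NN = 100, 30
X = rng.standard_normal((NT, NN))
C = X.T @ X

v = rng.standard_normal(NT)
v /= np.linalg.norm(v)                              # unit-norm ensemble trace
w = v @ X                                           # projection of every neuron on the trace
iorder = np.array([2, 5, 7, 11])                    # neurons assigned to the ensemble

# subtract the ensemble's contribution from the assigned neurons
X[:, iorder] -= np.outer(v, w[iorder])

# incremental rank-one update, mirroring fit()
wtw = np.outer(w[iorder], w)
C[:, iorder] -= wtw.T
C[iorder, :] -= wtw
C[iorder[:, np.newaxis], iorder] += wtw[:, iorder]

print(np.allclose(C, X.T @ X))                      # True: incremental update matches recompute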
def fit_model(self):
    for filename in self.mat_file_lst:
        print(filename)
        data = io.loadmat(self.data_path + filename)
        resp = data['stim'][0]['resp'][0]
        spont = data['stim'][0]['spont'][0]
        if self.model == 'EnsemblePursuit_numpy':
            X = subtract_spont(spont, resp).T
            options_dict = {'seed_neuron_av_nr': 100, 'min_assembly_size': 8}
            ep_np = EnsemblePursuitNumpy(n_ensembles=self.nr_of_components,
                                         lambd=self.lambd_,
                                         options_dict=options_dict)
            start = time.time()
            U, V = ep_np.fit_transform(X)
            end = time.time()
            tm = end - start
            print('Time', tm)
            np.save(self.save_path + filename + '_V_ep_numpy.npy', V)
            np.save(self.save_path + filename + '_U_ep_numpy.npy', U)
            np.save(self.save_path + filename + '_timing_ep_numpy.npy', tm)
        if self.model == 'EnsemblePursuit_pytorch':
            X = subtract_spont(spont, resp).T
            options_dict = {'seed_neuron_av_nr': 100, 'min_assembly_size': 8}
            ep_pt = EnsemblePursuitPyTorch(n_ensembles=self.nr_of_components,
                                           lambd=self.lambd_,
                                           options_dict=options_dict)
            start = time.time()
            U, V = ep_pt.fit_transform(X)
            end = time.time()
            tm = end - start
            print('Time', tm)
            np.save(self.save_path + filename + '_V_ep_pytorch.npy', V)
            np.save(self.save_path + filename + '_U_ep_pytorch.npy', U)
            np.save(self.save_path + filename + '_timing_ep_pytorch.npy', tm)
        if self.model == 'EnsemblePursuit_adaptive':
            X = subtract_spont(spont, resp).T
            options_dict = {'seed_neuron_av_nr': 100, 'min_assembly_size': 8}
            ep_pt = EnsemblePursuitPyTorch(n_ensembles=self.nr_of_components,
                                           lambd=self.lambd_,
                                           options_dict=options_dict)
            start = time.time()
            U, V = ep_pt.fit_transform(X)
            end = time.time()
            tm = end - start
            print('Time', tm)
            np.save(self.save_path + filename + '_V_ep_adaptive.npy', V)
            np.save(self.save_path + filename + '_U_ep_adaptive.npy', U)
        if self.model == 'SparsePCA':
            X = subtract_spont(spont, resp)
            X = zscore(X)
            sPCA = SparsePCA(n_components=self.nr_of_components,
                             random_state=7,
                             max_iter=100,
                             n_jobs=-1,
                             verbose=1)
            start = time.time()
            model = sPCA.fit(X)
            end = time.time()
            elapsed_time = end - start
            U = model.components_
            V = sPCA.transform(X)
            np.save(self.save_path + filename + '_U_sPCA.npy', U)
            np.save(self.save_path + filename + '_V_sPCA.npy', V)
            np.save(self.save_path + filename + '_time_sPCA.npy', elapsed_time)
        if self.model == 'ICA':
            X = subtract_spont(spont, resp)
            X = zscore(X)
            ICA = FastICA(n_components=self.nr_of_components, random_state=7)
            start = time.time()
            V = ICA.fit_transform(X)
            end = time.time()
            elapsed_time = end - start
            U = ICA.components_
            np.save(self.save_path + filename + '_U_ICA.npy', U)
            np.save(self.save_path + filename + '_V_ICA.npy', V)
            np.save(self.save_path + filename + '_time_ICA.npy', elapsed_time)
def CharacterTrajectories():
    symbols = [
        'a', 'b', 'c', 'd', 'e', 'g', 'h', 'l', 'm', 'n', 'o', 'p', 'q', 'r',
        's', 'u', 'v', 'w', 'y', 'z'
    ]
    statecounts = [4, 3, 2, 4, 3, 4, 3, 2, 6, 4, 3, 3, 4, 3, 3, 4, 2, 4, 2, 3]
    gaussiancounts = [
        2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2
    ]
    CT = np.load(
        '/home/scw4750/songbinxu/datasets/CharacterTrajectories/CharacterTrajectories.npz'
    )

    x_train = CT['x_train'][:, :, :2]
    x_train = [remove_padding(seq) for seq in x_train]
    x_train = [filtering(seq, window=5) for seq in x_train]
    x_train = [zscore(seq) for seq in x_train]
    y_train = CT['y_train']

    '''train'''
    # for label in range(20):
    #     sub_data = get_oneclass_data(x_train, y_train, label)
    #     chmm = CHMM(sub_data, state_num=statecounts[label], gaussian_num=gaussiancounts[label],
    #                 name='character_'+symbols[label], simplify=False)
    #     chmm.train(500)
    #     chmm.save_model('save/CharacterTrajectories/chmm_'+symbols[label]+'.npz')

    '''test'''
    # result = []
    # for label in range(20):
    #     print symbols[label]
    #     chmm = CHMM(x_train, state_num=statecounts[label], gaussian_num=gaussiancounts[label],
    #                 name='character_'+symbols[label], mode='test', simplify=False)
    #     chmm.load_model('save/CharacterTrajectories/chmm_'+symbols[label]+'.npz')
    #     chmm.Viterbi_decode()
    #     result.append(np.max(chmm.delta, 1)[:, None])
    # pred = np.argmax(np.concatenate(result, 1), 1)
    # correct, total = np.sum(np.equal(y_train, pred).astype(int)), len(y_train)
    # print "train: correct=%d total=%d accuracy=%.4f" % (correct, total, correct/float(total))

    x_test = CT['x_test'][:, :, :2]
    x_test = [remove_padding(seq) for seq in x_test]
    x_test = [filtering(seq, window=5) for seq in x_test]
    x_test = [zscore(seq) for seq in x_test]
    y_test = CT['y_test']

    result = []
    for label in range(20):
        print(symbols[label], end=' ')
        chmm = CHMM(x_test,
                    state_num=statecounts[label],
                    gaussian_num=gaussiancounts[label],
                    name='character_' + symbols[label],
                    mode='test',
                    simplify=False)
        chmm.load_model('save/CharacterTrajectories/chmm_' + symbols[label] + '.npz')
        chmm.Viterbi_decode()
        result.append(np.max(chmm.delta, 1)[:, None])
    pred = np.argmax(np.concatenate(result, 1), 1)
    correct, total = np.sum(np.equal(y_test, pred).astype(int)), len(y_test)
    print("test: correct=%d total=%d accuracy=%.4f" % (
        correct, total, correct / float(total)))  # 88.44%

    confuse_matrix = np.zeros((20, 20))
    for i in range(len(y_test)):
        confuse_matrix[y_test[i], pred[i]] += 1.0
    confuse_matrix = 1.0 - (confuse_matrix - np.min(confuse_matrix)) / (
        np.max(confuse_matrix) - np.min(confuse_matrix))
    ax = plt.subplot(111)
    plt.imshow(confuse_matrix, origin='lower', cmap='gray', interpolation='nearest')
    plt.xticks(range(20))
    plt.yticks(range(20))
    ax.set_xticklabels(symbols)
    ax.set_yticklabels(symbols)
    plt.savefig('save/CharacterTrajectories/confuse_matrix.png')
    plt.clf()
trainy_all = all_data[:, -2:]  # alpha, theta

X_temp, X_test, y_temp, y_test = train_test_split(trainx_all,
                                                  trainy_all,
                                                  test_size=test_fraction,
                                                  random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_temp,
                                                      y_temp,
                                                      test_size=valid_fraction,
                                                      random_state=42)

# array of length n_inputs, containing the mean of each feature
train_mean = np.mean(X_train, axis=0)
# array of length n_inputs, containing the standard dev of each feature
train_std = np.std(X_train, axis=0)

train_data = Data(torch.FloatTensor(zscore(X_train, train_mean, train_std)),
                  torch.FloatTensor(y_train))
valid_data = Data(torch.FloatTensor(zscore(X_valid, train_mean, train_std)),
                  torch.FloatTensor(y_valid))
test_data = Data(torch.FloatTensor(zscore(X_test, train_mean, train_std)),
                 torch.FloatTensor(y_test))

### ------
### Training
### ------


def train_model(model, dset_loaders, dset_sizes, criterion, optimizer,
data_name = 'isp_routers'
raw_data = load_ts_data(data_name, 'full')
data = raw_data.copy()

''' Sensor Motes data sets '''
#data_name = 'motes_l'
#raw_data = load_data(data_name)
#data = clean_zeros(raw_data, cpy=1)

''' Data Preprocessing '''
""" Data is loaded into memory, mean centered and standardised, then converted to an
iterable read by the CD-ST each iteration """
#data = zscore_win(data, 100)  # Sliding window implementation
data = zscore(data)  # Batch method implementation
data = np.nan_to_num(data)
z_iter = iter(data)
numStreams = data.shape[1]

''' Initialise CDST Algorithm '''
CDST_alg = CDST('F-FHST.A-SREboth', p, numStreams)

''' Main Loop '''
for zt in z_iter:
    zt = zt.reshape(zt.shape[0], 1)  # Convert to a column vector

    # Reset anomaly flag if last iteration flagged anomaly
    if np.any(CDST_alg.st['anomaly']):
        CDST_alg.st['anomaly'][:] = False