def __init__(self, ts, ticker, corpus, filter):
    self.ticker = ticker
    self.corpus = corpus
    # price series for this ticker: select() returns (dates, values)
    self.dts = ts.select('price_{}'.format(ticker))[0]
    self.price = ts.select('price_{}'.format(ticker))[1]
    # [1:] keeps these series aligned with the np.diff output below
    # (see the sketch after this method)
    self.crsp = ts.select('CRSP')[1][1:]
    self.returns = np.diff(np.log(self.price))
    self.adj_returns = zscore(self.returns - self.crsp)
    self.nt = ts.select('{}_{}_{}'.format(corpus, ticker, filter))[1][1:]
    self.sent = zscore(self.nt)
    # calendar dummy series
    self.friday = np.array([ts.select('friday')[1][1:]])
    self.jan = np.array([ts.select('january')[1][1:]])
    self.NWD = np.array([ts.select('NWD')[1][1:]])
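A quick note on the slicing above: np.diff shortens the price series by one element, so the CRSP and sentiment series are trimmed with [1:], presumably to keep them aligned with the log returns. A minimal standalone sketch with toy numbers (not from the original data):

import numpy as np
from scipy.stats import zscore

price = np.array([10.0, 10.5, 10.3, 10.8])
market = np.array([0.001, 0.002, -0.001, 0.003])  # hypothetical benchmark series

returns = np.diff(np.log(price))             # length 3
adj_returns = zscore(returns - market[1:])   # market[1:] aligns with the diff output
print(adj_returns.shape)                     # (3,)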
    def fit_model(self, bin=False):
        mt = sio.loadmat(self.data_path +
                         self.mouse_filename)  # neurons by timepoints
        self.X = mt['Fsp']
        self.motionSVD = np.array(mt['beh'][0]['face'][0]['motionSVD'][0][0]).T
        self.parea = np.array(mt['beh'][0]['pupil'][0]['area'][0][0])
        if bin:
            self.X, self.motionSVD, self.parea = bin_data(
                self.X, self.motionSVD, self.parea)
        else:
            self.nt = self.motionSVD.shape[1]
            tbin = 1
            self.motionSVD = np.reshape(
                self.motionSVD[:, :self.nt * tbin],
                (self.motionSVD.shape[0], self.nt, tbin)).mean(axis=-1)
            self.parea = np.reshape(self.parea[:self.nt * tbin],
                                    (self.nt, tbin)).mean(axis=-1)

        if self.model == 'EnsemblePursuit_numpy':
            options_dict = {'seed_neuron_av_nr': 100, 'min_assembly_size': 8}
            ep_np = EnsemblePursuitNumpy(n_ensembles=self.nr_of_components,
                                         lambd=0.01,
                                         options_dict=options_dict)
            U, V = ep_np.fit_transform(self.X)
            if self.save:
                bundle = {'U': U, 'V': V}
                np.save(
                    self.save_path + self.mouse_filename +
                    '_spont_ep_numpy.npy', bundle)
            return U, V
        if self.model == 'EnsemblePursuit_pytorch':
            options_dict = {'seed_neuron_av_nr': 100, 'min_assembly_size': 8}
            ep_pt = EnsemblePursuitPyTorch(n_ensembles=self.nr_of_components,
                                           lambd=0.01,
                                           options_dict=options_dict)
            U, V = ep_pt.fit_transform(self.X)
            if self.save:
                bundle = {'U': U, 'V': V}
                np.save(
                    self.save_path + self.mouse_filename +
                    '_spont_ep_pytorch.npy', bundle)
            return U, V
        if self.model == 'NMF':
            # NMF requires a nonnegative input matrix, so clip negative
            # activity to zero before fitting
            print(self.X[self.X < 0])
            #self.X-=self.X.min(axis=0)
            self.X[self.X < 0] = 0
            self.X = self.X.T
            model = NMF(n_components=self.nr_of_components,
                        init='nndsvd',
                        random_state=7)
            V = model.fit_transform(self.X)   # timepoints x components
            U = model.components_             # components x neurons
            return U.T, V
        if self.model == 'ICA':
            self.X = zscore(self.X)
            self.X = self.X.T
            ICA = FastICA(n_components=self.nr_of_components, random_state=7)
            V = ICA.fit_transform(self.X)
            U = ICA.components_
            return U.T, V
 def variance_explained_across_neurons(self, U, V):
     '''
     The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares
     ((y_true - y_pred) ** 2).sum() and v is the total sum of squares
     ((y_true - y_true.mean()) ** 2).sum().
     '''
     #Fetch the original data and convert it into the same form as what goes into the
     #matrix factorization model
     mt = sio.loadmat(self.data_path +
                      self.mouse_filename)  # neurons by timepoints
     X = mt['Fsp']
     if self.model == 'EnsemblePursuit_pytorch' or self.model == 'EnsemblePursuit_numpy':
         X = zscore(X.T).T
     u = []
     v = []
     approx = U @ V.T
     for j in range(X.shape[0]):
         u_j = ((X[j, :] - approx[j, :])**2).sum()
         v_j = ((X[j, :] - np.mean(X[j, :]))**2).sum()
         u.append(u_j)
         v.append(v_j)
     u = np.array(u)
     v = np.array(v)
     plt.plot(-np.divide(u, v) + 1)
     plt.title('Variance explained across neurons')
     plt.show()
     print('Total variance explained, averaged over neurons is:',
           (1 - np.mean(u) / np.mean(v)))
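The per-neuron loop above can also be written in vectorized form; a minimal sketch of the same R^2 computation (assuming X and approx = U @ V.T are the arrays built inside the method):

import numpy as np

def r2_per_neuron(X, approx):
    # residual and total sum of squares, one value per neuron (row)
    u = ((X - approx) ** 2).sum(axis=1)
    v = ((X - X.mean(axis=1, keepdims=True)) ** 2).sum(axis=1)
    return 1 - u / v

# plt.plot(r2_per_neuron(X, U @ V.T)) reproduces the curve plotted above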
Example #4
  def plot_dat(self, anomaly, data, standardise = 1):
    """ For a given detected anomaly, plots the data around that time point """

    # Need to add input checks 

    sample_half = int(round( self.p['zt_sample_size']/2.)) 
    dat = data[anomaly-sample_half:anomaly+sample_half,:]
    
    if standardise:
      fig = plt.figure()
      ax = fig.add_subplot(111)
      ax.plot(zscore(dat))
      bpList = bp_lookup(self.p['SAX_alphabet_size'])
      for bp in bpList:
        ax.axhline(y=bp, xmin=0, xmax=dat.shape[0], ls = '--', color = 'k')
        
      adjust_spines(ax, ['bottom'])
      
      ax.set_yticklabels([])
      ax.yaxis.set_ticks([])
      ax.set_xticklabels(range(anomaly-sample_half,anomaly+sample_half+1))
      for tick in ax.xaxis.get_major_ticks():
        tick.label1.set_fontsize(18)
      
      
    else: 
      plt.figure()
      plt.plot(dat)
      bpList = bp_lookup(self.p['SAX_alphabet_size'])
      plt.hlines(bpList, xmin=0, xmax=dat.shape[0]-1, linestyles = 'dashed', color = 'k')           
 def variance_explained_across_neurons(self, U, V):
     '''
     From sklearn:
     The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares
     ((y_true - y_pred) ** 2).sum() and v is the total sum of squares
     ((y_true - y_true.mean()) ** 2).sum().
     '''
     #Fetch the original data and convert it into the same form as what goes into the
     #matrix factorization model
     data = io.loadmat(self.data_path + self.mouse_filename)
     resp = data['stim'][0]['resp'][0]
     spont = data['stim'][0]['spont'][0]
     X = subtract_spont(spont, resp).T
     X = zscore(X.T).T
     u = []
     v = []
     approx = U @ V.T
     for j in range(X.shape[0]):
         u_j = ((X[j, :] - approx[j, :])**2).sum()
         v_j = ((X[j, :] - np.mean(X[j, :]))**2).sum()
         u.append(u_j)
         v.append(v_j)
     u = np.array(u)
     v = np.array(v)
     plt.plot(-np.divide(u, v) + 1)
     plt.title('Variance explained across neurons')
     plt.show()
     print('Total variance explained, averaged over neurons is:',
           (1 - np.mean(u) / np.mean(v)))
Example #6
def __init__(*args):
    repeat = int(args[0]) if len(args) >0 else 0
    if len(args) > 1:
        if isinstance(args[1], str):
            date1 = datetime.strptime(args[1], '%Y-%m-%d').date()
        else:
            date1 = args[1]
    else:
        date1 = None
    # date1 = datetime.strptime(args[1], '%Y-%m-%d').date() if len(args) >1 else None
    offset = int(args[2]) if len(args) > 2 else None
    exclude_index = 1
    results = []
    need_create = {}
    for i in range(repeat):
        delta = i * REPORT_OFFSET2
        start_date = date1 - timedelta(offset*(delta+1))
        end_date = date1 - timedelta(offset*delta)
        # print(date1, start_date, end_date)
        path = os.path.join('data', 'zscore', f'{start_date}_{end_date}.json')
        try:
            results.append(load_json(path))
        except Exception as e:
            results.append({})
            need_create[i] = (start_date, end_date)
    if len(need_create):
        data_all = load_stocklist_json()
        print('len: ', len(data_all))
        for i, d in enumerate(data_all):
            if exclude_index and is_index_stock(d['codeName']):
                continue
            full_code = d['full_code']
            output = load_stock_json(full_code, start_date=datetime.now().date())['output']
            for ii, dates in need_create.items():
                z_score = zscore(output, dates[0], dates[1])
                if z_score:
                    print(i, full_code, dates[0], dates[1], z_score)
                    results[ii][full_code] = z_score
        for ii, dates in need_create.items():
            path = os.path.join('data', 'zscore', f'{dates[0]}_{dates[1]}.json')
            save_json(results[ii], path)
    return results
Example #7
def SAX(data, alphabet_size, word_size, minstd = 1.0, pre_normed = False):
  """ Returns one word for each data stream 
  
  word_size == Number of segments data is split into for PAA
  alphabet_size == Number of symbols used
  
  Also now compatible with a single data stream.
  """
  
  if data.ndim == 1:
    num_streams = 1
    data = np.atleast_2d(data)
    data = data.T
  else:
    num_streams = data.shape[1]    
    
  # Need to insert check here for stationary segments
  mask = data.std(axis=0) < minstd
  passed = np.invert(mask)
  if np.any(mask):
    # Scale data to have a mean of 0 and a standard deviation of 1.
    if not pre_normed:
      data[:, passed] = zscore(data[:, passed])
    symbol4skips = string.ascii_letters[int(np.ceil(alphabet_size / 2.))]
  else:
    # Scale data to have a mean of 0 and a standard deviation of 1.
    if not pre_normed:
      data = zscore(data)
  
  # Calculate our breakpoint locations.
  breakpoints = bp_lookup(alphabet_size)
  breakpoints = np.concatenate((breakpoints, np.array([np.inf])))

  # Split the data into a list of word_size pieces.
  dataWords = np.array_split(data, word_size, axis=0)
  
  # Predefine matrices
  segment_means = np.zeros((word_size,num_streams))
  #segment_symbol = np.zeros((word_size,num_streams), dtype = np.str)
  p_array = np.zeros((num_streams,),dtype = ('a1,' * word_size + 'i2'))
  p_dict = {}
  
  # Calculate the mean for each section.  
  for i in range(word_size):
    segment_means[i,passed] = dataWords[i][:,passed].mean(axis = 0) 
    
  # Figure out which break each section is in based on the section_means and
  # calculated breakpoints. 
  for i in range(num_streams): 
    for j in range(word_size):
      if passed[i]:
        idx = int(np.where(breakpoints > segment_means[j,i])[0][0])
        # Store in phrase_array 
        p_array[i][j] = string.ascii_letters[idx]
      else:
        p_array[i][j] = symbol4skips
    
    # Store in phrase_dict
    phrase  = ''.join(tuple(p_array[i])[:word_size])
    if phrase in p_dict:
      p_dict[phrase].append(i)
    else:
      p_dict[phrase] = [i]

  # Put frequency of pattern in p_array
  for vals in p_dict.values():
    count = len(vals)
    for i in range(count):
      p_array[vals[i]][-1] = count  
  
  return p_array, p_dict, segment_means
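bp_lookup is not defined in this snippet. SAX breakpoints are conventionally the cut points that split a standard normal into alphabet_size equiprobable regions, so an assumed implementation (a sketch, not the original helper) could be:

import numpy as np
from scipy.stats import norm

def bp_lookup(alphabet_size):
    # alphabet_size - 1 interior breakpoints of a standard normal,
    # e.g. alphabet_size=4 -> [-0.6745, 0.0, 0.6745]
    probs = np.arange(1, alphabet_size) / float(alphabet_size)
    return norm.ppf(probs)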
    def fit(self, X):
        nK = self.n_components
        lam = self.lam
        n_kmeans = self.n_kmeans

        NT, NN = X.shape

        # z-score along time dimension
        X = utils.zscore(X, axis=0)

        # convert to float64 for numerical precision
        X = np.float64(X)

        # initialize k-means clusters and compute their variance in vm
        V, vm = initialize_kmeans(X, n_kmeans, lam)

        # initialize vectors in ensemble pursuit (Vs)
        vs = np.zeros((NT, nK))

        # initialize U
        U = np.zeros((NN, nK))

        # precompute covariance matrix of neurons
        C = X.T @ X

        # keep track of number of neurons per ensemble
        ns = np.zeros(nK, )

        # time the ensemble pursuit
        t0 = time.time()

        # keep track of neuron order in ensembles

        self.order = []
        self.seed = np.zeros((NT, nK))

        self.cost_deltas = []
        #  outer loop
        for j in range(nK):
            # initialize with "biggest" k-means ensemble (by variance)
            imax = np.argmax(vm)

            # zscore the seed trace
            seed = zscore(V[:, imax])

            # fit one ensemble starting from this seed
            iorder, current_v, cost_delta_lst = new_ensemble(X, C, seed, lam)
            self.order.append(iorder)
            self.seed[:, j] = seed

            # keep track of number of neurons
            ns[j] = len(iorder)

            # normalize current_v to unit norm
            current_v /= np.sum(current_v**2)**.5

            # update column of Vs
            vs[:, j] = current_v

            # projection of each neuron onto this ensemble trace
            w = current_v @ X

            # update weights for neurons in this ensemble
            U[iorder, j] = w[iorder]

            # update activity trace
            X[:, iorder] -= np.outer(current_v, w[iorder])

            # rank one update to C using wtw
            wtw = np.outer(w[iorder], w)

            # update the columns
            C[:, iorder] -= wtw.T

            # update the rows
            C[iorder, :] -= wtw

            # add back term for the submatrix of neurons in this ensemble
            C[iorder[:, np.newaxis], iorder] += wtw[:, iorder]

            # run one round of k-means because we changed X
            V, vm = one_round_of_kmeans(V, X, lam)

            self.cost_deltas.append(cost_delta_lst)

            if j % 25 == 0 or j == nK - 1:
                print('ensemble %d, time %2.2f, nr neurons %d, EV %2.4f' %
                      (j, time.time() - t0, len(iorder), 1 - np.mean(X**2)))
        print('average sparsity is %2.4f' % (np.mean(U > 1e-5)))

        self.components_ = vs
        self.weights = U
        self.residual_kmeans = V

        # the fit function has to return the model
        return self
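A small standalone check of the rank-one covariance update used in the loop above (toy data, not part of the original class): after the ensemble component is subtracted from the columns in iorder, recomputing X.T @ X from scratch matches the in-place update, provided current_v has unit norm.

import numpy as np

rng = np.random.default_rng(0)
NT, NN = 50, 12
X = rng.standard_normal((NT, NN))
C = X.T @ X

v = rng.standard_normal(NT)
v /= np.linalg.norm(v)               # unit norm, as in fit()
w = v @ X                            # projection of every neuron onto v
iorder = np.array([1, 4, 7])         # neurons "in the ensemble"

wtw = np.outer(w[iorder], w)
C[:, iorder] -= wtw.T                # update the columns
C[iorder, :] -= wtw                  # update the rows
C[iorder[:, np.newaxis], iorder] += wtw[:, iorder]

X[:, iorder] -= np.outer(v, w[iorder])
assert np.allclose(C, X.T @ X)       # in-place update equals recomputation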
Example #9
    def fit_model(self):
        for filename in self.mat_file_lst:
            print(filename)
            data = io.loadmat(self.data_path + filename)
            resp = data['stim'][0]['resp'][0]
            spont = data['stim'][0]['spont'][0]
            if self.model == 'EnsemblePursuit_numpy':
                X = subtract_spont(spont, resp).T
                options_dict = {
                    'seed_neuron_av_nr': 100,
                    'min_assembly_size': 8
                }
                ep_np = EnsemblePursuitNumpy(n_ensembles=self.nr_of_components,
                                             lambd=self.lambd_,
                                             options_dict=options_dict)
                start = time.time()
                U, V = ep_np.fit_transform(X)
                end = time.time()
                tm = end - start
                print('Time', tm)
                np.save(self.save_path + filename + '_V_ep_numpy.npy', V)
                np.save(self.save_path + filename + '_U_ep_numpy.npy', U)
                np.save(self.save_path + filename + '_timing_ep_numpy.npy', tm)
            if self.model == 'EnsemblePursuit_pytorch':
                X = subtract_spont(spont, resp).T
                options_dict = {
                    'seed_neuron_av_nr': 100,
                    'min_assembly_size': 8
                }
                ep_pt = EnsemblePursuitPyTorch(
                    n_ensembles=self.nr_of_components,
                    lambd=self.lambd_,
                    options_dict=options_dict)
                start = time.time()
                U, V = ep_pt.fit_transform(X)
                end = time.time()
                tm = end - start
                print('Time', tm)
                np.save(self.save_path + filename + '_V_ep_pytorch.npy', V)
                np.save(self.save_path + filename + '_U_ep_pytorch.npy', U)
                np.save(self.save_path + filename + '_timing_ep_pytorch.npy',
                        tm)
            if self.model == 'EnsemblePursuit_adaptive':
                # the adaptive option runs the PyTorch EnsemblePursuit implementation
                X = subtract_spont(spont, resp).T
                options_dict = {
                    'seed_neuron_av_nr': 100,
                    'min_assembly_size': 8
                }
                ep_pt = EnsemblePursuitPyTorch(
                    n_ensembles=self.nr_of_components,
                    lambd=self.lambd_,
                    options_dict=options_dict)
                start = time.time()
                U, V = ep_pt.fit_transform(X)
                end = time.time()
                tm = end - start
                print('Time', tm)
                np.save(self.save_path + filename + '_V_ep_adaptive.npy', V)
                np.save(self.save_path + filename + '_U_ep_adaptive.npy', U)

            if self.model == 'SparsePCA':
                X = subtract_spont(spont, resp)
                X = zscore(X)
                sPCA = SparsePCA(n_components=self.nr_of_components,
                                 random_state=7,
                                 max_iter=100,
                                 n_jobs=-1,
                                 verbose=1)
                start = time.time()
                model = sPCA.fit(X)
                end = time.time()
                elapsed_time = end - start
                U = model.components_
                V = sPCA.transform(X)
                np.save(self.save_path + filename + '_U_sPCA.npy', U)
                np.save(self.save_path + filename + '_V_sPCA.npy', V)
                np.save(self.save_path + filename + '_time_sPCA.npy',
                        elapsed_time)
            if self.model == 'ICA':
                X = subtract_spont(spont, resp)
                X = zscore(X)
                ICA = FastICA(n_components=self.nr_of_components,
                              random_state=7)
                start = time.time()
                V = ICA.fit_transform(X)
                end = time.time()
                elapsed_time = end - start
                U = ICA.components_
                np.save(self.save_path + filename + '_U_ICA.npy', U)
                np.save(self.save_path + filename + '_V_ICA.npy', V)
                np.save(self.save_path + filename + '_time_ICA.npy',
                        elapsed_time)
Example #10
def CharacterTrajectories():
    symbols = [
        'a', 'b', 'c', 'd', 'e', 'g', 'h', 'l', 'm', 'n', 'o', 'p', 'q', 'r',
        's', 'u', 'v', 'w', 'y', 'z'
    ]
    statecounts = [4, 3, 2, 4, 3, 4, 3, 2, 6, 4, 3, 3, 4, 3, 3, 4, 2, 4, 2, 3]
    gaussiancounts = [
        2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2
    ]

    CT = np.load(
        '/home/scw4750/songbinxu/datasets/CharacterTrajectories/CharacterTrajectories.npz'
    )
    x_train = CT['x_train'][:, :, :2]
    x_train = [remove_padding(seq) for seq in x_train]
    x_train = [filtering(seq, window=5) for seq in x_train]
    x_train = [zscore(seq) for seq in x_train]
    y_train = CT['y_train']
    '''train'''
    # for label in range(20):
    #     sub_data = get_oneclass_data(x_train, y_train, label)
    #     chmm = CHMM(sub_data, state_num=statecounts[label], gaussian_num=gaussiancounts[label],
    #                 name='character_'+symbols[label], simplify=False)
    #     chmm.train(500)
    #     chmm.save_model('save/CharacterTrajectories/chmm_'+symbols[label]+'.npz')
    '''test'''
    # result = []
    # for label in range(20):
    #     print symbols[label]
    #     chmm = CHMM(x_train, state_num=statecounts[label], gaussian_num=gaussiancounts[label],
    #                 name='character_'+symbols[label], mode='test', simplify=False)
    #     chmm.load_model('save/CharacterTrajectories/chmm_'+symbols[label]+'.npz')
    #     chmm.Viterbi_decode()
    #     result.append(np.max(chmm.delta, 1)[:, None])
    # pred = np.argmax(np.concatenate(result, 1), 1)
    # correct, total = np.sum(np.equal(y_train, pred).astype(int)), len(y_train)
    # print "train: correct=%d total=%d accuracy=%.4f" % (correct, total, correct/float(total))

    x_test = CT['x_test'][:, :, :2]
    x_test = [remove_padding(seq) for seq in x_test]
    x_test = [filtering(seq, window=5) for seq in x_test]
    x_test = [zscore(seq) for seq in x_test]
    y_test = CT['y_test']
    result = []
    for label in range(20):
        print(symbols[label], end=' ')
        chmm = CHMM(x_test,
                    state_num=statecounts[label],
                    gaussian_num=gaussiancounts[label],
                    name='character_' + symbols[label],
                    mode='test',
                    simplify=False)
        chmm.load_model('save/CharacterTrajectories/chmm_' + symbols[label] +
                        '.npz')
        chmm.Viterbi_decode()
        result.append(np.max(chmm.delta, 1)[:, None])
    pred = np.argmax(np.concatenate(result, 1), 1)
    correct, total = np.sum(np.equal(y_test, pred).astype(int)), len(y_test)
    print "test: correct=%d total=%d accuracy=%.4f" % (
        correct, total, correct / float(total))  # 88.44%

    confuse_matrix = np.zeros((20, 20))
    for i in range(len(y_test)):
        confuse_matrix[y_test[i], pred[i]] += 1.0
    confuse_matrix = 1.0 - (confuse_matrix - np.min(confuse_matrix)) / (
        np.max(confuse_matrix) - np.min(confuse_matrix))

    ax = plt.subplot(111)
    plt.imshow(confuse_matrix,
               origin='lower',
               cmap='gray',
               interpolation='nearest')
    plt.xticks(range(20))
    plt.yticks(range(20))
    ax.set_xticklabels(symbols)
    ax.set_yticklabels(symbols)

    plt.savefig('save/CharacterTrajectories/confuse_matrix.png')
    plt.clf()
Example #11
trainy_all = all_data[:, -2:]  # alpha, theta

X_temp, X_test, y_temp, y_test = train_test_split(trainx_all,
                                                  trainy_all,
                                                  test_size=test_fraction,
                                                  random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_temp,
                                                      y_temp,
                                                      test_size=valid_fraction,
                                                      random_state=42)

# array of length n_inputs, containing the mean of each feature
train_mean = np.mean(X_train, axis=0)
# array of length n_inputs, containing the standard dev of each feature
train_std = np.std(X_train, axis=0)
train_data = Data(torch.FloatTensor(zscore(X_train, train_mean, train_std)),
                  torch.FloatTensor(y_train))
valid_data = Data(torch.FloatTensor(zscore(X_valid, train_mean, train_std)),
                  torch.FloatTensor(y_valid))
test_data = Data(torch.FloatTensor(zscore(X_test, train_mean, train_std)),
                 torch.FloatTensor(y_test))
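The zscore used here takes an explicit mean and standard deviation (computed on the training split only), unlike scipy.stats.zscore; the helper is not shown in this snippet, but an assumed equivalent is:

import numpy as np

def zscore(x, mean, std):
    # standardise with statistics from the training split so that
    # validation and test data do not leak into the normalisation
    return (x - mean) / std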


### ------
### Training
### ------
def train_model(model,
                dset_loaders,
                dset_sizes,
                criterion,
                optimizer,
Example #12
 data_name = 'isp_routers'
 raw_data = load_ts_data(data_name, 'full')
 data = raw_data.copy()
 
 ''' Sensor Motes data sets '''
 #data_name = 'motes_l'
 #raw_data = load_data(data_name)
 #data = clean_zeros(raw_data, cpy=1)  
 
 
 ''' Data Preprocessing '''
 """ Data is loaded into memory, mean centered and standardised
 then converted to an iterable to read by the CD-ST each iteration"""
 
 #data = zscore_win(data, 100) # Sliding window implementation
 data = zscore(data) # Batch method implementation
 
 data = np.nan_to_num(data) 
 z_iter = iter(data) 
 numStreams = data.shape[1]
 
 ''' Initialise CDST Algorithm '''
 CDST_alg = CDST('F-FHST.A-SREboth', p, numStreams)
 
 ''' Main Loop '''
 for zt in z_iter:
   zt = zt.reshape(zt.shape[0],1)   # Convert to a column Vector 
   
   # Reset anomaly flag if last iteration flagged anomaly
   if np.any(CDST_alg.st['anomaly']):
     CDST_alg.st['anomaly'][:] = False