def decision_function(self, X):
    """Evaluate the decision function for X with the available backend.

    When this instance has an ``_onedal_estimator`` attribute, the call is
    forwarded to that estimator; otherwise it falls back to
    ``sklearn_NuSVC.decision_function``. The backend that was chosen is
    reported through ``logging.info`` before the call is made.
    """
    use_onedal = hasattr(self, '_onedal_estimator')
    backend = "onedal" if use_onedal else "sklearn"
    logging.info("sklearn.svm.NuSVC.decision_function: "
                 + get_patch_message(backend))
    if use_onedal:
        return self._onedal_estimator.decision_function(X)
    return sklearn_NuSVC.decision_function(self, X)
class NuSVCImpl:
    """Delegating wrapper around an ``Op`` instance built from hyperparameters."""

    def __init__(self, **hyperparams):
        # Keep the raw hyperparameters next to the model constructed from them.
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model (forwarding ``y`` only when given); return self."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's ``predict(X)``."""
        model = self._wrapped_model
        return model.predict(X)

    def predict_proba(self, X):
        """Return the wrapped model's ``predict_proba(X)``."""
        model = self._wrapped_model
        return model.predict_proba(X)

    def decision_function(self, X):
        """Return the wrapped model's ``decision_function(X)``."""
        model = self._wrapped_model
        return model.decision_function(X)
class TestNuSVCIntegration(TestCase):
    """Checks a PMML-loaded NuSVC against a freshly fitted reference NuSVC."""

    def setUp(self):
        # First CSV column is the target, the remaining columns are features.
        frame = pd.read_csv(path.join(BASE_DIR, '../models/categorical-test.csv'))
        features = frame.iloc[:, 1:]
        target = frame.iloc[:, 0]
        encoded = pd.get_dummies(features, prefix_sep='')

        self.test = (features, target)
        self.enc = (encoded, target)

        # Model under test (loaded from PMML) and the reference estimator.
        self.clf = PMMLNuSVC(path.join(BASE_DIR, '../models/svc-cat-pima.pmml'))
        self.ref = NuSVC()
        self.ref.fit(encoded, target)

    def test_fit_exception(self):
        """Calling fit on the PMML-backed model must raise 'Not supported.'."""
        empty_X = np.array([[]])
        empty_y = np.array([])
        with self.assertRaises(Exception) as cm:
            self.clf.fit(empty_X, empty_y)

        assert str(cm.exception) == 'Not supported.'

    def test_more_tags(self):
        """The PMML-backed model reports the same tags as a plain NuSVC."""
        expected_tags = NuSVC()._more_tags()
        assert self.clf._more_tags() == expected_tags

    def test_sklearn2pmml(self):
        """Round-trip the reference model through sklearn2pmml and compare."""
        features, labels = self.enc

        # Export to PMML
        pipeline = PMMLPipeline([("classifier", self.ref)])
        pipeline.fit(features, labels)
        sklearn2pmml(pipeline, "svc-sklearn2pmml.pmml", with_repr=True)

        try:
            # Import PMML
            model = PMMLNuSVC(pmml='svc-sklearn2pmml.pmml')

            # Verify classification
            expected = self.ref.decision_function(features)
            assert np.allclose(expected, model.decision_function(features))
        finally:
            # Always remove the exported file, even when the check fails.
            remove("svc-sklearn2pmml.pmml")
# Fit the factorization machine (one component, no linear term).
fm = FactorizationMachineClassifier(
    n_components=1,
    fit_linear=False,
    random_state=0,
)
fm.fit(X, y)

# Fit a degree-2 polynomial-kernel NuSVC on the same data for comparison.
svc = NuSVC(kernel='poly', degree=2)
svc.fit(X, y)

# Evaluate both decision functions on every grid point and bring the
# results back to the shape of the grid.
Z = fm.decision_function(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
Z_svc = svc.decision_function(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

# Background image: decision values of the factorization machine.
plt.imshow(
    Z,
    interpolation='nearest',
    extent=(xx.min(), xx.max(), yy.min(), yy.max()),
    aspect='auto',
    origin='lower',
    cmap=plt.cm.PuOr_r,
)

# Zero-level contours: solid for the FM, dashed for the SVC.
contour_fm = plt.contour(xx, yy, Z, levels=[0], linewidths=2)
contour_svc = plt.contour(xx, yy, Z_svc, levels=[0], linestyles='dashed')

# Training points, coloured by label.
plt.scatter(X[:, 0], X[:, 1], s=30, c=y, cmap=plt.cm.Paired)
plt.xticks(())
# Standardize both data sets with the statistics of the training data.
# Single-argument print(...) calls behave identically on Python 2 and 3,
# whereas the former print statements were Python-2-only syntax.
print('standardization')
scaler = preprocessing.StandardScaler().fit(trn_data)
trn_data_scaled = scaler.transform(trn_data)
tst_data_scaled = scaler.transform(tst_data)

# Train a linear nu-SVM on the training data and classify the test data.
clf = NuSVC(nu=0.5, kernel='linear')
clf.fit(trn_data_scaled, trn_label)
pred_label = clf.predict(tst_data_scaled)
print(pred_label)
print(clf.decision_function(tst_data_scaled))

# Fraction of test samples whose predicted label matches the true label.
accu = sum(pred_label == tst_label) / float(len(pred_label))

# Common prefix of every accuracy file written below.
acc_prefix = (options['working_path'] + opt_group_folder
              + args.align_algo + '_acc_')
if args.align_algo in ['ppca_idvclas', 'pica_idvclas']:
    # For these two algorithms the same accuracy is stored under the
    # indices 0..10.
    for it in range(11):
        np.savez_compressed(acc_prefix + str(it) + '.npz', accu=accu)
else:
    np.savez_compressed(acc_prefix + str(itr) + '.npz', accu=accu)
print(acc_prefix + str(itr) + '.npz')
class Learner:
    """Online trading-signal learner that periodically refits a NuSVC.

    Each call to :meth:`predict` consumes one price update, maintains
    sliding windows of prices and returns, collects (feature, label) pairs
    and calls :meth:`train` every ``batch_size`` steps. Once a model has
    been fitted, the decision is ``tanh`` of the SVM decision value.

    @input recurrence: dimensionality of the feature space; every dimension
           corresponds to one of the last n returns (positive integer)
    @input w_size: a training batch spans ``w_size * recurrence`` steps
    @input hybrid: True if this learner is part of a hybrid learner
    """

    def __init__(self, recurrence=30, w_size=20, hybrid=False):
        self.learner = NuSVC()
        self.recurrence = recurrence
        # true if part of a hybrid learner
        self.hybrid = hybrid
        # number of steps between two calls to train()
        self.batch_size = w_size * (recurrence)
        # maximum length of the price / return histories
        self.window_size = 5 * self.batch_size
        # return history, r_n := price_n - price_(n-1)
        self.returns = []
        self.prices = []
        # feature vectors and labels gathered since the last fit
        self.train_dat = []
        self.labels = []
        self.decisions = []
        self.last_decision = 0
        self.ready = False
        self.tstep = 0

    def predict(self, new_price, old_price, tstep=0):
        """Consume one price update and return the current decision.

        Returns 0 until the SVM has been fitted; afterwards the first element
        of ``tanh(decision_function(x))``, where ``x`` holds the
        ``recurrence`` returns preceding the latest one. The ``tstep``
        argument is accepted but not used.
        """
        # default decision value
        outcome = 0

        # slide the price window
        self.prices.append(new_price)
        if len(self.prices) > self.window_size:
            self.prices.pop(0)

        latest_return = new_price - old_price

        # label for the newest return (checked before the step counter moves)
        if self.tstep > self.recurrence:
            self.labels.append(self.label_returns(latest_return))

        self.tstep += 1

        # slide the return window
        self.returns.append(latest_return)
        if self.tstep > self.window_size and len(self.returns) > self.window_size:
            self.returns.pop(0)

        # a full batch triggers a refit
        if self.tstep % self.batch_size == 0 and self.tstep != 0:
            self.train()

        if self.tstep > self.recurrence:
            count = len(self.returns)
            window = self.returns[count - self.recurrence - 1:count - 1]
            # stored as a column vector; train() transposes it back to a row
            column = np.array(window)
            column = column.reshape((len(column), 1))
            self.train_dat.append(column)
            # a decision is only produced once the SVM has been trained
            if self.ready:
                scores = np.tanh(
                    self.learner.decision_function(np.transpose(column)))
                outcome = scores[0]

        self.last_decision = outcome
        return outcome

    def train(self):
        """Fit the SVM on the collected batch, then clear the batch buffers."""
        # one row per collected label, one column per return in the window
        features = np.zeros((len(self.labels), self.recurrence))
        for idx, row in enumerate(features):
            row[:] = np.transpose(self.train_dat[idx])
        self.learner.fit(features, self.labels)
        # clear the training-related structures
        self.labels = []
        self.train_dat = []
        self.ready = True

    def label_set(self, return_list):
        """Return 1 if the mean of ``return_list`` reaches the mean of all
        stored returns, else -1 (simple trend detection on the mean)."""
        mu_current = np.mean(return_list)
        mu_total = np.mean(self.returns)
        return 1 if mu_current >= mu_total else -1

    def label_returns(self, next_return):
        """Return 1 for a strictly positive return, otherwise -1."""
        return 1 if next_return > 0 else -1
class Learner:
    """Online trading-signal learner using a NuSVC and Sharpe-style labels.

    Each call to :meth:`predict` consumes one price update, maintains
    sliding windows of prices, returns, decisions and decision-weighted
    returns, and refits the SVM via :meth:`train` whenever the step counter
    is a multiple of both ``batch_size`` and ``window_size``.

    @input adaption: adaption rate of the running statistics in label_util
    @input transactionCost: cost factor applied to a change of the decision
    @input recurrence: dimensionality of the feature space; every dimension
           corresponds to one of the last n returns (positive integer)
    @input realy_recurrent: default=False; if true, the last decision is
           appended to the prediction vector
    @input w_size: the sliding window spans ``w_size`` training batches
    @input label_par: parameter used for labeling, 'r' for returns,
           'p' for prices (stored; not read anywhere in this class)

    NOTE(review): with ``realy_recurrent=True`` the prediction vector has
    ``recurrence + 1`` entries while train() builds rows of ``recurrence``
    returns -- confirm that mode is actually used before relying on it.
    """

    def __init__(self, adaption=0.5, transactionCost=0.001, recurrence=35,
                 realy_recurrent=False, w_size=20, label_par='r'):
        self.learner = NuSVC()
        self.transactionCost = transactionCost
        self.adaption = adaption
        # size of each training batch
        self.batch_size = 200 * (recurrence)
        # size of the sliding window for the Sharpe ratio
        self.window_size = w_size * self.batch_size
        # return history, r_n := price_n - price_(n-1)
        self.returns = []
        self.labels = []
        self.decisions = [0]
        self.weighted_returns = []
        self.recurrence = recurrence
        self.last_decision = 0
        self.ready = False
        self.tstep = 0
        self.recurrent = realy_recurrent
        self.prices = []
        self.label_par = label_par
        # The value 1 doubles as the "not initialised yet" marker that
        # label_util() checks for.
        self.sharpeA_old = 1
        self.sharpeB_old = 1

    def predict(self, new_price, old_price, tstep=0):
        """Consume one price update and return the current decision.

        Returns 0.5 until the SVM has been fitted; afterwards a float equal
        to ``tanh`` of the SVM decision value for the ``recurrence`` returns
        preceding the latest one. The ``tstep`` argument is accepted but not
        used.
        """
        latest_return = new_price - old_price

        # slide the price window
        self.prices.append(new_price)
        if len(self.prices) > self.window_size:
            self.prices.pop(0)

        self.tstep += 1
        self.returns.append(latest_return)

        if self.ready:
            count = len(self.returns)
            x = self.returns[count - self.recurrence - 1:count - 1]
            if self.recurrent:
                x.append(self.last_decision)
            x = np.array(x).reshape((1, -1))
            # decision_function() yields one value per sample; reduce it to a
            # plain float so the decision / weighted-return histories stay
            # homogeneous (mixing floats with 1-element arrays makes
            # np.array() on those histories fail in train()).
            decision = float(
                np.ravel(np.tanh(self.learner.decision_function(x)))[0])
        else:
            decision = 0.5

        # return earned by the previous decision minus the cost of switching
        self.weighted_returns.append(
            self.last_decision * latest_return
            - (self.transactionCost * np.fabs(self.last_decision - decision)))
        # keep this history windowed like the others instead of letting it
        # grow without bound
        if len(self.weighted_returns) > self.window_size:
            self.weighted_returns.pop(0)

        if self.tstep > self.window_size and len(self.returns) > self.window_size:
            self.returns.pop(0)

        if (self.tstep % self.batch_size == 0 and self.tstep != 0
                and self.tstep % self.window_size == 0):
            self.train()
            self.ready = True

        self.decisions.append(decision)
        if len(self.decisions) > self.window_size:
            self.decisions.pop(0)
        self.last_decision = decision
        return decision

    def train(self):
        """Refit the SVM from scratch on the most recent batch.

        Rows of the training matrix are windows of ``recurrence`` raw
        returns; the label of each row comes from :meth:`label_util`.
        """
        returns = np.array(self.returns)
        returns = returns[len(returns) - (self.batch_size):]
        weighted_returns = np.array(self.weighted_returns)
        weighted_returns = weighted_returns[
            len(weighted_returns) - (self.batch_size):]
        decisions = np.array(self.decisions)
        decisions = decisions[len(decisions) - (self.batch_size):]

        training_matrix = []
        self.labels = []
        for i in range(self.recurrence, len(weighted_returns) - 1):
            window = weighted_returns[i - self.recurrence:i]
            self.labels.append(
                self.label_util(window[:self.recurrence - 1], decisions[i]))
            training_matrix.append(returns[i - self.recurrence:i])

        self.learner.fit(training_matrix, self.labels)

    def label_set(self, return_list, decision_list):
        """Label from the means of a return list and a decision list.

        Returns -1 when both means are negative or both are non-negative,
        otherwise 1.
        """
        same_side = (np.mean(return_list) < 0) == (np.mean(decision_list) < 0)
        return -1 if same_side else 1

    def label_util(self, return_list, decision):
        """Alternative (and highly experimental) labeling function.

        Updates the running statistics ``sharpeA_old`` / ``sharpeB_old`` from
        the last entry of ``return_list`` and derives a performance value.
        Returns 1 when ``performance < 0`` and ``decision < 0`` agree,
        otherwise -1.
        """
        last = return_list[len(return_list) - 1]
        if self.sharpeA_old == 1:
            # first call: initialise from the stored weighted returns
            sharpeA = np.mean(self.weighted_returns)
            sharpeB = np.std(self.weighted_returns)
        else:
            sharpeA = self.sharpeA_old + self.adaption * (last - self.sharpeA_old)
            sharpeB = self.sharpeB_old + self.adaption * (
                (last ** 2) - self.sharpeB_old)

        performance = (sharpeB - (last - self.sharpeA_old)) - (
            sharpeA * (last ** 2 - self.sharpeB_old))
        # 1.5 rather than (3 / 2): the latter is integer division (== 1) on
        # Python 2.
        performance /= ((sharpeB - (sharpeA ** 2)) ** 1.5)

        self.sharpeA_old = sharpeA
        self.sharpeB_old = sharpeB

        return 1 if (performance < 0) == (decision < 0) else -1

    def label_last(self, return_list, decision_list):
        """Label from the last return and the decision at the same index.

        Returns 1 when both are negative or both are non-negative,
        otherwise -1.
        """
        last = len(return_list) - 1
        same_side = (return_list[last] < 0) == (decision_list[last] < 0)
        return 1 if same_side else -1
y[flip] = ~y[flip] # fit the model fm = FactorizationMachineClassifier(n_components=1, fit_linear=False, random_state=0) fm.fit(X, y) # fit a NuSVC for comparison svc = NuSVC(kernel='poly', degree=2) svc.fit(X, y) # plot the decision function for each datapoint on the grid Z = fm.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) Z_svc = svc.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z_svc = Z_svc.reshape(xx.shape) plt.imshow(Z, interpolation='nearest', extent=(xx.min(), xx.max(), yy.min(), yy.max()), aspect='auto', origin='lower', cmap=plt.cm.PuOr_r) contour_fm = plt.contour(xx, yy, Z, levels=[0], linewidths=2) contour_svc = plt.contour(xx, yy, Z_svc, levels=[0], linestyles='dashed') plt.scatter(X[:, 0], X[:, 1], s=30, c=y, cmap=plt.cm.Paired) plt.xticks(()) plt.yticks(()) plt.axis([-3, 3, -3, 3]) plt.legend((contour_fm.collections[0], contour_svc.collections[0]),
""" print 'standardization' #print trn_data #print tst_data #trn_data_scaled = preprocessing.scale(trn_data) #tst_data_scaled = preprocessing.scale(tst_data) scaler = preprocessing.StandardScaler().fit(trn_data) trn_data_scaled = scaler.transform(trn_data) tst_data_scaled = scaler.transform(tst_data) #print trn_data_scaled #print tst_data_scaled clf = NuSVC(nu=0.5, kernel = 'linear') clf.fit(trn_data_scaled, trn_label) pred_label = clf.predict(tst_data_scaled) print pred_label print clf.decision_function(tst_data_scaled) accu = sum(pred_label == tst_label)/float(len(pred_label)) if args.align_algo in ['ppca_idvclas','pica_idvclas']: for it in range(11): np.savez_compressed(options['working_path']+opt_group_folder+args.align_algo+'_acc_'+str(it)+'.npz',accu = accu) else: np.savez_compressed(options['working_path']+opt_group_folder+args.align_algo+'_acc_'+str(itr)+'.npz',accu = accu) #np.savez_compressed(options['working_path']+opt_group_folder+args.align_algo+'_acc_'+str(10)+'.npz',accu = accu) print options['working_path']+opt_group_folder+args.align_algo+'_acc_'+str(itr)+'.npz' print np.mean(accu)
class SimpleCellTypeClassifier:
    """A cell type classifier for smoothed scRNA-Seq data.

    Training (:meth:`fit`) projects the expression matrix into PCA space,
    balances the cell types by oversampling and fits a linear ``NuSVC`` on
    the selected principal components. Optional subclassifiers, stored in
    ``sub_classifiers`` under the cell type they refine, are applied to the
    cells predicted to have that cell type.

    Args:
        d: Passed to ``get_sel_components`` to determine the selected
            principal components.
        nu: The ``nu`` parameter of the ``NuSVC`` model.
        seed: Seed used for the PCA, the oversampling and the SVM.
        sub_classifiers: Optional mapping of cell type -> subclassifier.
        name: Optional display name of the classifier.
    """

    PICKLE_PROTOCOL_VERSION = 4  # requires Python 3.4 or higher

    def __init__(self, d: int = 10, nu: float = 0.20, seed: int = 0,
                 sub_classifiers=None, name: str = None) -> None:
        if sub_classifiers is None:
            sub_classifiers = {}

        self.name = name
        self.d = d
        self.nu = nu
        self.seed = seed
        self.sub_classifiers = sub_classifiers

        # Attributes with a trailing underscore are set by fit().
        self.transcript_count_ = None
        self.pca_model_ = None
        self.genes_ = None
        self.cell_labels_ = None
        self.svm_model_ = None

    @property
    def sel_components(self) -> List[int]:
        """Returns the selected principal components for ``d``."""
        return get_sel_components(self.d)

    @property
    def num_components(self) -> int:
        """Returns the number of the highest selected principal component."""
        return max(self.sel_components) + 1

    def __str__(self) -> str:
        try:
            name = self.name
        except AttributeError:
            # presumably for instances created before "name" existed
            name = None

        if name is None:
            name_str = '(no name)'
        else:
            name_str = '"%s"' % name

        param_str = '\n'.join([
            '- d = %d' % self.d,
            '- transcript_count = %s\n' % (str(self.transcript_count_)),
            '- nu = %s' % str(self.nu),
            '- seed = %d' % self.seed
        ])

        if self.svm_model_ is None:
            header_str = ('Moana cell type classifier (**untrained**)\n'
                          '------------------------------------------')
        else:
            header_str = ('Moana cell type classifier\n'
                          '--------------------------')

        clf_str = ('%s\n' % header_str +
                   'Name: %s\n\n' % name_str +
                   'Parameters:\n'
                   '%s' % param_str)

        if self.svm_model_ is None:
            return clf_str

        ctype_str = self._get_ctype_str()
        msg = ('%s\n\n' % clf_str +
               'Cell types / subtypes:\n'
               '%s' % ctype_str)
        return msg

    def _get_ctype_str(self, prefix='-',
                       include_subtypes: bool = True) -> str:
        """Get a bullet list of all cell types / subtypes of this classifier.

        Subtypes are listed below their parent with one extra ``-`` in the
        bullet prefix.
        """
        ctype_list = []
        for ctype in sorted(self.cell_types_):
            subtype_str = '%s %s' % (prefix, ctype)
            if include_subtypes:
                subclf = self.get_subclassifier(
                    ctype, search_subclassifiers=False)
                if subclf is not None:
                    subtype_str += (
                        '\n%s' % subclf._get_ctype_str(prefix + '-'))
            ctype_list.append('%s' % subtype_str)
        return '\n'.join(ctype_list)

    def _require_trained_classifier(self) -> None:
        """Raise a RuntimeError if the classifier has not been trained."""
        if self.svm_model_ is None:
            raise RuntimeError('You must train the classifier first!')

    @property
    def num_cells_(self) -> int:
        """Returns the number of cells in the training data."""
        return self.cell_labels_.size

    @property
    def value_counts_(self) -> pd.DataFrame:
        """Returns the value counts for the training labels."""
        return self.cell_labels_.value_counts()

    @property
    def classes_(self) -> List[str]:
        """Alias of ``cell_types_``."""
        return self.cell_types_

    @property
    def cell_types_(self) -> List[str]:
        """Returns the classes of the trained SVM model."""
        self._require_trained_classifier()
        return self.svm_model_.classes_

    @property
    def gene_loadings_(self) -> ExpMatrix:
        """Returns a matrix with the gene loadings from the selected PCs."""
        data = self.pca_model_.components_.T[:, self.sel_components]
        dim_labels = get_component_labels(self.sel_components)
        loadings = ExpMatrix(genes=self.genes_, cells=dim_labels, data=data)
        return loadings.copy()

    @property
    def gene_coef_(self) -> ExpMatrix:
        """Returns a matrix with gene coefficients for SVM classifier.

        There is one column per pair of cell types, labeled
        ``"<type i>" vs "<type j>"`` for i < j.
        """
        gene_loadings = self.gene_loadings_
        svm_coef = self.svm_model_.coef_

        n = len(self.cell_types_)
        clf_labels = [
            '"%s" vs "%s"' % (self.cell_types_[i], self.cell_types_[j])
            for i in range(n - 1) for j in range(i + 1, n)
        ]

        gene_coef = ExpMatrix(genes=self.genes_, cells=clf_labels,
                              dtype=np.float64)
        for j in range(len(svm_coef)):
            gene_coef.iloc[:, j] = gene_loadings.values.dot(svm_coef[j])
        return gene_coef.copy()

    @property
    def normalized_gene_coef_(self) -> ExpMatrix:
        """Returns the gene coefficients, scaled per SVM coefficient vector.

        Column j is divided by the sum of the absolute values of the j-th
        SVM coefficient vector (sum of its non-negative entries minus the sum
        of its negative entries).
        """
        gene_coef = self.gene_coef_
        normalized_gene_coef = gene_coef.copy()
        svm_coef = self.svm_model_.coef_
        for j in range(len(svm_coef)):
            sel = (svm_coef[j] >= 0)
            size = (svm_coef[j][sel].sum() - svm_coef[j][~sel].sum())
            factor = 1 / size
            # Scale the column that belongs to this coefficient vector
            # (not always column 0).
            normalized_gene_coef.iloc[:, j] = \
                normalized_gene_coef.iloc[:, j] * factor
        return normalized_gene_coef

    def __getattribute__(self, name):
        # "transcript_count_" falls back to the attribute name
        # "med_num_transcripts_" when it is missing on the instance.
        if name == 'transcript_count_':
            try:
                val = super().__getattribute__(name)
            except AttributeError:
                val = super().__getattribute__('med_num_transcripts_')
            return val
        return super().__getattribute__(name)

    def fit(self, matrix: ExpMatrix, cell_labels: CellAnnVector) -> None:
        """Train a cell classifier for scRNA-Seq data.

        Args:
            matrix: The expression matrix (genes x cells).
            cell_labels: The cell type of each training cell.

        Raises:
            ValueError: If ``cell_labels`` contains cells that are missing
                from ``matrix``.
        """
        sublogger = logging.getLogger('moana.preprocess')
        prev_level = sublogger.level
        sublogger.setLevel(logging.WARNING)

        try:
            if not cell_labels.index.to_series().isin(matrix.cells).all():
                raise ValueError('Not all cells in cell type vector are '
                                 'contained in the expression matrix!')

            if set(matrix.cells) != set(cell_labels.cells):
                _LOGGER.warning('Cell type vector and expression matrix do not '
                                'contain the same set of cells!')

            # make sure the two datasets are aligned
            matrix = matrix.loc[:, cell_labels.index]

            # determine median transcript count of smoothed matrix
            transcript_count = float(matrix.sum(axis=0).median())

            ### perform PCA
            _LOGGER.info('Moana training -- Performing PCA...')

            # normalize matrix
            matrix = pp.median_normalize(matrix)

            # perform PCA
            tmatrix, pca_model = pp.pca(matrix, self.num_components,
                                        seed=self.seed)

            # select specific principal components
            # (currently we always select all the PCs)
            tmatrix = tmatrix.iloc[self.sel_components]

            # report fraction of variance explained
            frac_variance_explained = \
                pca_model.explained_variance_ratio_[self.sel_components].sum()
            _LOGGER.info(
                'Moana training -- The selected PCs represent %.1f %% of '
                'total variance.', 100 * frac_variance_explained)

            # set training variables
            self.genes_ = matrix.genes.copy()
            self.transcript_count_ = transcript_count
            self.pca_model_ = pca_model
            self.cell_labels_ = cell_labels.copy()

            # set seed before sampling
            np.random.seed(self.seed)

            # perform semi-random oversampling to balance cluster sizes:
            # every cluster is repeated as a whole as often as it fits into
            # the size of the first cluster in value_counts(), and the
            # remainder is filled with a random sample of its cells
            vc = cell_labels.value_counts()
            max_cells = vc.values[0]
            train_tmatrix = []
            train_labels = []
            for cluster_label in vc.index:
                sel = (cell_labels == cluster_label)
                sub_tmatrix = tmatrix.loc[:, sel]
                num_reps = max_cells // sub_tmatrix.n
                num_remaining_cells = max_cells - (num_reps * sub_tmatrix.n)
                sub_tmatrix_rep = pd.concat(
                    [sub_tmatrix] * num_reps +
                    [sub_tmatrix.sample(n=num_remaining_cells, axis=1)],
                    axis=1)
                sub_labels_rep = CellAnnVector(
                    cells=sub_tmatrix_rep.cells,
                    data=[cluster_label] * max_cells)
                train_tmatrix.append(sub_tmatrix_rep)
                train_labels.append(sub_labels_rep)

            train_tmatrix = pd.concat(train_tmatrix, axis=1)
            train_labels = pd.concat(train_labels)

            ### Train NuSVC model
            self.svm_model_ = NuSVC(
                nu=self.nu, kernel='linear',
                decision_function_shape='ovo',
                random_state=self.seed)  # initialize the model
            self.svm_model_.fit(train_tmatrix.T, train_labels)  # train it

            # report performance on training data
            predictions = self.predict(matrix)
            precision_summary = get_precision_summary(
                cell_labels, predictions)
            _LOGGER.info(
                'Moana training -- SVM classifier performance (precision) '
                'on training data: %s', precision_summary)
        finally:
            # restore the log level even if training failed
            sublogger.setLevel(prev_level)

    def transform(self, matrix: ExpMatrix) -> ExpMatrix:
        """Project a matrix into the PC space defined by the training data."""
        self._require_trained_classifier()
        tmatrix = apply_pca(
            matrix, self.pca_model_, self.transcript_count_,
            components=self.sel_components, valid_genes=self.genes_)
        return tmatrix

    def predict(self, matrix: ExpMatrix,
                predict_subtypes: bool = True) -> CellAnnVector:
        """Predict cell types.

        Args:
            matrix: The expression matrix (genes x cells).
            predict_subtypes: Whether to refine the predictions with the
                subclassifiers (if there are any).

        Raises:
            RuntimeError: If the classifier has not been trained.
        """
        self._require_trained_classifier()

        t0 = time.time()

        sublogger = logging.getLogger('moana.preprocess.smooth')
        prev_level = sublogger.level
        sublogger.setLevel(logging.WARNING)

        sublogger2 = logging.getLogger('moana.classify.util')
        prev_level2 = sublogger2.level
        sublogger2.setLevel(logging.WARNING)

        try:
            _LOGGER.info(
                'Moana prediction -- This classifier was trained using d=%d, '
                'nu=%s, at %s transcripts / cell.',
                self.d, str(self.nu), str(self.transcript_count_))

            ### 1. Apply PCA
            _LOGGER.info(
                'Moana prediction -- Data will be scaled %.2f-fold before '
                'FT-transformation and projection into PC space.',
                self.transcript_count_ / matrix.sum(axis=0).median())
            tmatrix = apply_pca(
                matrix, self.pca_model_, self.transcript_count_,
                components=self.sel_components, valid_genes=self.genes_)

            ### 2. Predict cell type
            y = self.svm_model_.predict(tmatrix.T)
            predictions = CellAnnVector(cells=matrix.cells, data=y)
            predictions.name = 'Predicted cell type'

            # Series.items() exists in every pandas version, whereas
            # iteritems() was removed in pandas 2.0.
            result_str = '; '.join(
                '%s - %d' % (ctype, n)
                for ctype, n in predictions.value_counts().items())
            _LOGGER.info('Moana prediction -- Prediction results: %s',
                         result_str)

            ### 3. Apply subclassifiers (if there are any)
            if predict_subtypes:
                for ctype in self.classes_:
                    subclf = self.sub_classifiers.get(ctype)
                    if subclf is None:
                        continue

                    matrix_sub = matrix.loc[:, predictions == ctype]
                    if matrix_sub.n == 0:
                        _LOGGER.info(
                            'No cells with predicted cell type "%s"!', ctype)
                        continue

                    _LOGGER.info(
                        'Running subclassifier for cell type "%s"...', ctype)
                    try:
                        pred_sub = subclf.predict(matrix_sub)
                    except ValueError as err:
                        # not enough cells, not enough transcripts
                        # available etc.
                        _LOGGER.error(
                            'Subclassifier produced an error ("%s"), therefore '
                            'skipping the prediction of subtypes.', str(err))
                    else:
                        predictions.loc[pred_sub.cells] = pred_sub
        finally:
            # restore the log levels even if the prediction failed
            sublogger.setLevel(prev_level)
            sublogger2.setLevel(prev_level2)

        t1 = time.time()
        _LOGGER.info('Cell type prediction took %.1f s.', t1 - t0)

        return predictions

    def get_decision_function(self, matrix: ExpMatrix,
                              cell_labels: CellAnnVector):
        """Calculate cell (average) decision func. values for their cluster.

        The pairwise ("ovo") decision values are accumulated per class (added
        for the first class of each pair, subtracted for the second) and
        divided by ``num_classes - 1``; for every cell, the value of the
        class given in ``cell_labels`` is returned.

        Raises:
            RuntimeError: If the classifier has not been trained.
        """
        self._require_trained_classifier()

        # Apply PCA (will select same genes)
        tmatrix = apply_pca(
            matrix, self.pca_model_, self.transcript_count_,
            components=self.sel_components, valid_genes=self.genes_)

        num_samples = cell_labels.size
        num_classes = len(self.classes_)

        # get decision function values from SVM classifier
        dfun_array = self.svm_model_.decision_function(tmatrix.T)

        # calculate all averages; k runs over the class pairs (i, j), i < j
        dfun_summed = np.zeros((num_samples, num_classes), dtype=np.float64)
        k = 0
        for i in range(num_classes):
            for j in range(i + 1, num_classes):
                dfun_summed[:, i] += dfun_array[:, k]
                dfun_summed[:, j] -= dfun_array[:, k]
                k += 1
        dfun_avg = dfun_summed / (num_classes - 1)
        dfun_avg = CellAnnMatrix(cells=tmatrix.cells, columns=self.classes_,
                                 data=dfun_avg)

        # select averages corresponding to cell cluster
        dfun = CellAnnVector(cells=tmatrix.cells, dtype=np.float64)
        for cell, ctype in cell_labels.items():
            dfun.loc[cell] = dfun_avg.loc[cell, ctype]

        return dfun

    @property
    def has_subclassifiers(self):
        """Determines if the classifier has any subclassifiers."""
        return any(
            self.get_subclassifier(ctype) is not None
            for ctype in self.cell_types_)

    def get_classifier_by_cell_type(self, cell_type: str,
                                    search_subclassifiers: bool = True):
        """Retrieves the (sub-)classifier for a given cell type.

        Raises a ValueError if the specified cell type is unknown.
        """
        if cell_type in self.cell_types_:
            return self

        if search_subclassifiers:
            for subclf in self.sub_classifiers.values():
                if subclf is None:
                    continue
                try:
                    return subclf.get_classifier_by_cell_type(cell_type)
                except ValueError:
                    continue

        # The listing of valid cell types is only built for the error message.
        ctype_str = self._get_ctype_str(include_subtypes=search_subclassifiers)
        if not search_subclassifiers:
            raise ValueError(
                'The top-level classifier does not have a cell type "%s". '
                'Valid cell types:\n%s' % (cell_type, ctype_str))
        raise ValueError('Neither the top-level classifier nor any of its '
                         'subclassifiers has a cell type "%s". '
                         'Valid cell types:\n%s' % (cell_type, ctype_str))

    def add_subclassifier(self, cell_type: str, subclf,
                          search_subclassifiers: bool = True) -> None:
        """Adds a subclassifier for a given cell type.

        If a subclassifier for the cell type has already been defined,
        it will be replaced and a warning message will be issued.
        """
        if subclf is None:
            _LOGGER.warning('No classifier specified, will do nothing. '
                            'Use "remove_subclassifier" to remove an existing '
                            'subclassifier')
            return

        clf = self.get_classifier_by_cell_type(
            cell_type, search_subclassifiers=search_subclassifiers)

        if clf.sub_classifiers.get(cell_type) is not None:
            # a subclassifier for the specified cell type already exists
            _LOGGER.warning(
                'Replacing existing subclassifier for cell type "%s".',
                cell_type)

        clf.sub_classifiers[cell_type] = subclf
        _LOGGER.info('Added subclassifier for cell type "%s"', cell_type)

    def get_subclassifier(self, cell_type: str, must_exist: bool = False,
                          search_subclassifiers: bool = True):
        """Retrieves the subclassifier for a given cell type.

        Optionally raises an exception if no classifier is found.

        Always raises an exception if the specified cell type is unknown.
        """
        clf = self.get_classifier_by_cell_type(
            cell_type, search_subclassifiers=search_subclassifiers)

        subclf = clf.sub_classifiers.get(cell_type)
        if subclf is None and must_exist:
            raise ValueError('Cell type "%s" has no subclassifier!'
                             % cell_type)
        return subclf

    def remove_subclassifier(self, cell_type: str,
                             search_subclassifiers: bool = True) -> None:
        """Removes the subclassifier for a given cell type.

        An exception is raised if no subclassifier exists for this cell type.
        """
        clf = self.get_classifier_by_cell_type(
            cell_type, search_subclassifiers=search_subclassifiers)

        if clf.sub_classifiers.get(cell_type) is None:
            raise ValueError('Cell type "%s" has no subclassifier!'
                             % cell_type)

        del clf.sub_classifiers[cell_type]
        _LOGGER.info('Removed subclassifier for cell type "%s".', cell_type)

    def write_pickle(self, file_path: str) -> None:
        """Write classifier to file in pickle format."""
        with open(file_path, 'wb') as ofh:
            pickle.dump(self, ofh, self.PICKLE_PROTOCOL_VERSION)
        _LOGGER.info('Wrote Moana classifier to "%s".', file_path)

    @classmethod
    def read_pickle(cls, file_path: str):
        """Read classifier from pickle file."""
        # NOTE: unpickling can execute arbitrary code; only load files from
        # a trusted source.
        with open(file_path, 'rb') as fh:
            clf = pickle.load(fh)
        _LOGGER.info('Loaded Moana classifier from "%s".', file_path)
        return clf
class Learner:
    """Online trading-signal learner using a NuSVC and Sharpe-style labels.

    Each call to :meth:`predict` consumes one price update, maintains
    sliding windows of prices, returns, decisions and decision-weighted
    returns, and refits the SVM via :meth:`train` whenever the step counter
    is a multiple of both ``batch_size`` and ``window_size``.

    @input adaption: adaption rate of the running statistics in label_util
    @input transactionCost: cost factor applied to a change of the decision
    @input recurrence: dimensionality of the feature space; every dimension
           corresponds to one of the last n returns (positive integer)
    @input realy_recurrent: default=False; if true, the last decision is
           appended to the prediction vector
    @input w_size: the sliding window spans ``w_size`` training batches
    @input label_par: parameter used for labeling, 'r' for returns,
           'p' for prices (stored; not read anywhere in this class)

    NOTE(review): with ``realy_recurrent=True`` the prediction vector has
    ``recurrence + 1`` entries while train() builds rows of ``recurrence``
    returns -- confirm that mode is actually used before relying on it.
    """

    def __init__(self, adaption=0.5, transactionCost=0.001, recurrence=35,
                 realy_recurrent=False, w_size=20, label_par='r'):
        self.learner = NuSVC()
        self.transactionCost = transactionCost
        self.adaption = adaption
        # size of each training batch
        self.batch_size = 200 * (recurrence)
        # size of the sliding window for the Sharpe ratio
        self.window_size = w_size * self.batch_size
        # return history, r_n := price_n - price_(n-1)
        self.returns = []
        self.labels = []
        self.decisions = [0]
        self.weighted_returns = []
        self.recurrence = recurrence
        self.last_decision = 0
        self.ready = False
        self.tstep = 0
        self.recurrent = realy_recurrent
        self.prices = []
        self.label_par = label_par
        # The value 1 doubles as the "not initialised yet" marker that
        # label_util() checks for.
        self.sharpeA_old = 1
        self.sharpeB_old = 1

    def predict(self, new_price, old_price, tstep=0):
        """Consume one price update and return the current decision.

        Returns 0.5 until the SVM has been fitted; afterwards a float equal
        to ``tanh`` of the SVM decision value for the ``recurrence`` returns
        preceding the latest one. The ``tstep`` argument is accepted but not
        used.
        """
        latest_return = new_price - old_price

        # slide the price window
        self.prices.append(new_price)
        if len(self.prices) > self.window_size:
            self.prices.pop(0)

        self.tstep += 1
        self.returns.append(latest_return)

        if self.ready:
            count = len(self.returns)
            x = self.returns[count - self.recurrence - 1:count - 1]
            if self.recurrent:
                x.append(self.last_decision)
            x = np.array(x).reshape((1, -1))
            # decision_function() yields one value per sample; reduce it to a
            # plain float so the decision / weighted-return histories stay
            # homogeneous (mixing floats with 1-element arrays makes
            # np.array() on those histories fail in train()).
            decision = float(
                np.ravel(np.tanh(self.learner.decision_function(x)))[0])
        else:
            decision = 0.5

        # return earned by the previous decision minus the cost of switching
        self.weighted_returns.append(
            self.last_decision * latest_return
            - (self.transactionCost * np.fabs(self.last_decision - decision)))
        # keep this history windowed like the others instead of letting it
        # grow without bound
        if len(self.weighted_returns) > self.window_size:
            self.weighted_returns.pop(0)

        if self.tstep > self.window_size and len(self.returns) > self.window_size:
            self.returns.pop(0)

        if (self.tstep % self.batch_size == 0 and self.tstep != 0
                and self.tstep % self.window_size == 0):
            self.train()
            self.ready = True

        self.decisions.append(decision)
        if len(self.decisions) > self.window_size:
            self.decisions.pop(0)
        self.last_decision = decision
        return decision

    def train(self):
        """Refit the SVM from scratch on the most recent batch.

        Rows of the training matrix are windows of ``recurrence`` raw
        returns; the label of each row comes from :meth:`label_util`.
        """
        returns = np.array(self.returns)
        returns = returns[len(returns) - (self.batch_size):]
        weighted_returns = np.array(self.weighted_returns)
        weighted_returns = weighted_returns[
            len(weighted_returns) - (self.batch_size):]
        decisions = np.array(self.decisions)
        decisions = decisions[len(decisions) - (self.batch_size):]

        training_matrix = []
        self.labels = []
        for i in range(self.recurrence, len(weighted_returns) - 1):
            window = weighted_returns[i - self.recurrence:i]
            self.labels.append(
                self.label_util(window[:self.recurrence - 1], decisions[i]))
            training_matrix.append(returns[i - self.recurrence:i])

        self.learner.fit(training_matrix, self.labels)

    def label_set(self, return_list, decision_list):
        """Label from the means of a return list and a decision list.

        Returns -1 when both means are negative or both are non-negative,
        otherwise 1.
        """
        same_side = (np.mean(return_list) < 0) == (np.mean(decision_list) < 0)
        return -1 if same_side else 1

    def label_util(self, return_list, decision):
        """Alternative (and highly experimental) labeling function.

        Updates the running statistics ``sharpeA_old`` / ``sharpeB_old`` from
        the last entry of ``return_list`` and derives a performance value.
        Returns 1 when ``performance < 0`` and ``decision < 0`` agree,
        otherwise -1.
        """
        last = return_list[len(return_list) - 1]
        if self.sharpeA_old == 1:
            # first call: initialise from the stored weighted returns
            sharpeA = np.mean(self.weighted_returns)
            sharpeB = np.std(self.weighted_returns)
        else:
            sharpeA = self.sharpeA_old + self.adaption * (last - self.sharpeA_old)
            sharpeB = self.sharpeB_old + self.adaption * (
                (last ** 2) - self.sharpeB_old)

        performance = (sharpeB - (last - self.sharpeA_old)) - (
            sharpeA * (last ** 2 - self.sharpeB_old))
        # 1.5 rather than (3 / 2): the latter is integer division (== 1) on
        # Python 2.
        performance /= ((sharpeB - (sharpeA ** 2)) ** 1.5)

        self.sharpeA_old = sharpeA
        self.sharpeB_old = sharpeB

        return 1 if (performance < 0) == (decision < 0) else -1

    def label_last(self, return_list, decision_list):
        """Label from the last return and the decision at the same index.

        Returns 1 when both are negative or both are non-negative,
        otherwise -1.
        """
        last = len(return_list) - 1
        same_side = (return_list[last] < 0) == (decision_list[last] < 0)
        return 1 if same_side else -1
class Learner:
    """Trend classifier that feeds sliding windows of returns to an ``NuSVC``.

    Parameters
    ----------
    recurrence : int, default 30
        Number of consecutive past returns that make up one feature vector.
    w_size : int, default 20
        Multiplier for the batch size; a batch spans ``w_size * recurrence``
        time steps.
    hybrid : bool, default False
        Stored as ``self.hybrid``; not read anywhere inside this class.
    """

    def __init__(self, recurrence=30, w_size=20, hybrid=False):
        self.learner = NuSVC()

        # Window geometry.
        self.recurrence = recurrence
        # Number of time steps between two training runs.
        self.batch_size = w_size * (recurrence)
        # Maximum length of the price / return history that is kept.
        self.window_size = 5 * self.batch_size

        # True if part of a hybrid learner.
        self.hybrid = hybrid

        # History of returns, where a return is new_price - old_price.
        self.returns = []
        self.prices = []

        # Samples and labels collected since the last training run.
        self.train_dat = []
        self.labels = []

        self.decisions = []
        self.last_decision = 0
        # Becomes True after the first call to train().
        self.ready = False
        # Number of predict() calls seen so far.
        self.tstep = 0

    def predict(self, new_price, old_price, tstep=0):
        """Consume one price update and return the decision for this step.

        The return ``new_price - old_price`` is appended to the history, a
        label for it is recorded, and training is triggered whenever the
        step counter reaches a multiple of ``batch_size``.  Once more than
        ``recurrence`` steps have been seen, the ``recurrence`` returns that
        precede the latest one are stored as a training sample.

        The result is ``0`` until the SVM has been trained; afterwards it is
        the tanh of the SVM decision value for the current sample.  The
        ``tstep`` argument is accepted but not used; the internal counter
        ``self.tstep`` is what drives the logic.
        """
        decision = 0

        # Bounded price history.
        self.prices.append(new_price)
        if len(self.prices) > self.window_size:
            self.prices.pop(0)

        latest_return = new_price - old_price

        # Record the label before the step counter advances.
        if self.tstep > self.recurrence:
            self.labels.append(self.label_returns(latest_return))

        self.tstep += 1

        # Bounded return history.
        self.returns.append(latest_return)
        if self.tstep > self.window_size and len(self.returns) > self.window_size:
            self.returns.pop(0)

        # A full batch has been collected: fit the SVM.
        if self.tstep % self.batch_size == 0 and self.tstep != 0:
            self.train()

        if self.tstep > self.recurrence:
            # The sample excludes the most recent return.
            stop = len(self.returns) - 1
            sample = np.array(self.returns[stop - self.recurrence:stop])
            sample = sample.reshape((len(sample), 1))
            self.train_dat.append(sample)

            # Only ask the SVM once it has been fitted.
            if self.ready:
                scores = np.tanh(self.learner.decision_function(np.transpose(sample)))
                decision = scores[0]

        self.last_decision = decision
        return decision

    def train(self):
        """Fit the SVM on the samples collected since the previous fit.

        Builds a ``(len(self.labels), self.recurrence)`` matrix from the
        stored samples, calls ``fit`` on the learner (not ``partial_fit``),
        clears the collected samples and labels, and sets ``self.ready``.
        """
        features = np.zeros((len(self.labels), self.recurrence))
        for index, row in enumerate(features):
            # Each stored sample is a column vector; write it in as a row.
            row[:] = np.transpose(self.train_dat[index])

        self.learner.fit(features, self.labels)

        # Start collecting the next batch from scratch.
        self.labels = []
        self.train_dat = []
        self.ready = True

    def label_set(self, return_list):
        """Return 1 if the mean of ``return_list`` is at least the mean of
        the stored return history, otherwise -1."""
        mu_current = np.mean(return_list)
        mu_total = np.mean(self.returns)
        return 1 if mu_current >= mu_total else -1

    def label_returns(self, next_return):
        """Return 1 for a strictly positive return, otherwise -1."""
        return 1 if next_return > 0 else -1