def get_prediction_res(self, model_path, kernel):
    """Run a random forest on saved tree features and return dev predictions.

    Reads ``X_train_tree_<kernel>.npy``, ``X_dev_tree_<kernel>.npy``,
    ``label_train.npy`` and ``label_dev.npy`` from ``model_path``, builds a
    ``RandomForest`` named ``'permutation_test'`` with ``test=True``, runs it
    and returns the second element of ``evaluate()``.

    Args:
        model_path: directory that holds the ``.npy`` files.
        kernel: kernel identifier; converted with ``int()`` for the file names.
    """
    kernel_id = int(kernel)

    def _load(filename):
        # every array lives directly under model_path
        return np.load(os.path.join(model_path, filename))

    features_train = _load('X_train_tree_%d.npy' % kernel_id)
    features_dev = _load('X_dev_tree_%d.npy' % kernel_id)
    labels_train = _load('label_train.npy')
    labels_dev = _load('label_dev.npy')

    forest = RandomForest(
        'permutation_test',
        features_train,
        labels_train,
        features_dev,
        labels_dev,
        test=True,
    )
    forest.run()
    # only the second element of evaluate() is needed here
    dev_predictions = forest.evaluate()[1]
    return dev_predictions
def FV_RF(self):
    """Run a random forest on each Fisher-vector model in ``model_list.txt``.

    The list file is read from the ``save_dir`` of ``AutoEncoder('fv_gmm', 0)``.
    Each line is treated as a model directory. Directories that do not hold
    both ``X_train_tree_<kernel>.npy`` and ``X_dev_tree_<kernel>.npy`` are
    skipped. For the others a ``RandomForest`` is run and ``get_UAR`` is called
    once with the train predictions and once with the dev predictions.
    """
    print("\nrunning Random Forest on Fisher Vectors")
    encoder = AutoEncoder('fv_gmm', 0)
    list_file = os.path.join(encoder.save_dir, 'model_list.txt')

    with smart_open(list_file, 'rb', encoding='utf-8') as listing:
        for index, raw in enumerate(listing):
            model_dir = str(raw).replace('\n', '')
            # the first 65 characters are dropped to form the display tag
            tag = model_dir[65:]
            print(index, '\t', tag)
            feature_name = tag + '_%d' % self.kernel

            train_path = os.path.join(
                model_dir, 'X_train_tree_%d.npy' % self.kernel)
            dev_path = os.path.join(
                model_dir, 'X_dev_tree_%d.npy' % self.kernel)
            if not (os.path.isfile(train_path) and os.path.isfile(dev_path)):
                # tree features are missing for this kernel
                continue

            features_train = np.load(train_path)
            features_dev = np.load(dev_path)
            labels_train = np.load(os.path.join(model_dir, 'label_train.npy'))
            labels_dev = np.load(os.path.join(model_dir, 'label_dev.npy'))
            print(features_train.shape, features_dev.shape)

            forest = RandomForest(
                feature_name,
                features_train,
                labels_train,
                features_dev,
                labels_dev,
                test=False,
            )
            forest.run()
            pred_train, pred_dev = forest.evaluate()

            no_inst = np.array([])
            get_UAR(pred_train, labels_train, no_inst, 'RF', feature_name,
                    'single', train_set=True, test=False)
            get_UAR(pred_dev, labels_dev, np.array([]), 'RF', feature_name,
                    'single', test=False)
def run_MFCC(self):
    """Run a classifier on MFCC features (single modality).

    Loads the 'MFCC' baseline features through ``load_proc_baseline_feature``
    and then, depending on ``self.model_name``:

    * ``'RF_cv'``: stacks train and dev, iterates over the folds returned by
      ``k_fold_cv``, runs a ``RandomForest`` per fold and stores the per-fold
      session results with ``save_cv_results``.
    * ``'SVM'`` / ``'RF'``: runs ``LinearSVM`` / ``RandomForest`` on the
      upsampled, CSR-converted data.

    Afterwards ``get_UAR`` is called for the train and dev predictions and,
    when ``self.test`` is falsy, ``get_post_probability`` for the dev set.

    Reads ``self.model_name``, ``self.feature_name`` and ``self.test``.
    """
    print("\nbuilding a classifier on MFCC features (both frame-level and session-level)")
    X_train, y_train, train_inst, X_dev, y_dev, dev_inst = load_proc_baseline_feature('MFCC', verbose=True)

    if self.model_name == 'RF_cv':
        # flatten labels and instance ids so they can be concatenated with hstack
        y_train, y_dev = np.ravel(y_train), np.ravel(y_dev)
        train_inst, dev_inst = np.ravel(train_inst), np.ravel(dev_inst)

        # pool train and dev into one set that the folds index into
        X = np.vstack((X_train, X_dev))
        y = np.hstack((y_train, y_dev))
        inst = np.hstack((train_inst, dev_inst))
        assert len(X) == len(y) == len(inst)

        cv_ids = k_fold_cv(len(X))
        cv_res = []
        for (ids_train, ids_dev) in cv_ids:
            # the fold split overwrites the originally loaded train/dev variables
            X_train = X[ids_train]
            y_train = y[ids_train]
            X_dev = X[ids_dev]
            y_dev = y[ids_dev]
            dev_inst = inst[ids_dev]

            RF_MFCC = RandomForest(self.feature_name, X_train, y_train, X_dev, y_dev, baseline=True, test=self.test)
            RF_MFCC.run()
            y_pred_train, y_pred_dev = RF_MFCC.evaluate()
            # only the second element returned by get_UAR is kept per fold
            _, session_res = get_UAR(y_pred_dev, y_dev, dev_inst, self.model_name, self.feature_name, 'baseline', baseline=True, test=True)
            cv_res.append(session_res)
        save_cv_results(cv_res, self.model_name, self.feature_name, 'baseline')

    # NOTE(review): the original indentation was lost; as reconstructed, the
    # 'RF_cv' path falls through to the code below with last-fold
    # X_train/y_train but the full train_inst -- confirm whether an early
    # return was intended.
    print("\nupsampling training data to address class imbalance")
    X_train, y_train, train_inst = upsample(X_train, y_train, train_inst)

    print("\nobtaining sparse matrix for better classification")
    # X_train = sp.csr_matrix(np.vstack((X_train, X_dev)))
    # X_dev = sp.csr_matrix(X_dev)
    # y_train = np.hstack((y_train, y_dev))
    X_train, X_dev = sp.csr_matrix(X_train), sp.csr_matrix(X_dev)

    if self.model_name == 'SVM':
        SVM_MFCC = LinearSVM(self.feature_name, X_train, y_train, X_dev, y_dev, baseline=True, test=self.test)
        SVM_MFCC.run()
        y_pred_train, y_pred_dev = SVM_MFCC.evaluate()
    elif self.model_name == 'RF':
        RF_MFCC = RandomForest(self.feature_name, X_train, y_train, X_dev, y_dev, baseline=True, test=self.test)
        RF_MFCC.run()
        y_pred_train, y_pred_dev = RF_MFCC.evaluate()

    # NOTE(review): y_pred_train / y_pred_dev are only assigned in the
    # branches above (or in the CV loop); any other model_name would reach
    # this point with them unbound.
    get_UAR(y_pred_train, y_train, train_inst, self.model_name, self.feature_name, 'baseline', baseline=True, train_set=True, test=self.test)
    get_UAR(y_pred_dev, y_dev, dev_inst, self.model_name, self.feature_name, 'baseline', baseline=True, test=self.test)
    if not self.test:
        get_post_probability(y_pred_dev, y_dev, dev_inst, np.array([]), self.model_name, self.feature_name)
def RF(self):
    """Run a random forest on every fused feature set, three times over.

    Feature names are read, one per line, from
    ``./pre-trained/fusion/feature_list.txt``. For each name the arrays
    ``pre-trained/fusion/<feature>/X_train.npy`` and ``X_dev.npy`` are loaded,
    labels come from ``load_label()`` (cast to ``int``), a ``RandomForest`` is
    run, and ``get_UAR`` is called for the train and dev predictions.

    The list file is opened in a ``with`` block so the handle is closed once
    the names have been read; previously it was never closed.
    """
    print(
        "\nrunning RF on features selected with RF with doc2vec embeddings"
    )

    # read all names up front and release the file handle immediately
    with smart_open('./pre-trained/fusion/feature_list.txt', 'rb',
                    encoding='utf-8') as feature_path:
        feature_list = [str(line).replace('\n', '') for line in feature_path]

    # the complete sweep over all features is repeated three times
    for _ in range(3):
        for feature in feature_list:
            # load_label() returns four values; the last two are used here,
            # in the order (dev, train)
            _, _, y_dev, y_train = load_label()
            y_train = y_train.astype('int')
            y_dev = y_dev.astype('int')

            X_train = np.load(
                os.path.join('pre-trained', 'fusion', feature, 'X_train.npy'))
            X_dev = np.load(
                os.path.join('pre-trained', 'fusion', feature, 'X_dev.npy'))

            random_forest = RandomForest(feature, X_train, y_train, X_dev,
                                         y_dev, test=False)
            random_forest.run()
            y_pred_train, y_pred_dev = random_forest.evaluate()

            get_UAR(y_pred_train, y_train, np.array([]), 'RF', feature,
                    'multiple', train_set=True, test=False)
            get_UAR(y_pred_dev, y_dev, np.array([]), 'RF', feature,
                    'multiple', test=False)
def test_random_forest(self):
    """Smoke-run ``RandomForest`` on a random split of the iris dataset.

    Only the first two feature columns are used. Fifteen randomly chosen
    samples form the held-out set; the rest are used for training. The model
    is created with ``test=True`` and ``run()`` is called; nothing is returned.
    """
    import numpy as np
    from sklearn import datasets

    # load the iris dataset
    iris = datasets.load_iris()
    # NOTE that we only use first two features for 2-d plot
    features = iris.data[:, :2]
    labels = iris.target

    held_out = 15
    shuffled = np.random.permutation(len(features))
    train_idx = shuffled[:-held_out]
    test_idx = shuffled[-held_out:]

    forest = RandomForest(
        'IRIS',
        features[train_idx],
        labels[train_idx],
        features[test_idx],
        labels[test_idx],
        test=True,
    )
    forest.run()
def TEXT_RF(self):
    """Run a random forest on each document-embedding model in ``model_list.txt``.

    The list file sits in the doc2vec ``save_dir`` taken from
    ``Text2Vec().model_config``. Each line is treated as a model directory
    holding ``vectors_{train,dev}.npy`` and ``labels_{train,dev}.npy``. Labels
    are flattened with ``np.ravel`` before a ``RandomForest`` is run, and
    ``get_UAR`` is called for the train and dev predictions.
    """
    print("\nrunning Random Forest on document embeddings")
    text2vec = Text2Vec()
    list_file = os.path.join(
        text2vec.model_config['doc2vec']['save_dir'], 'model_list.txt')

    array_files = ('vectors_train.npy', 'vectors_dev.npy',
                   'labels_train.npy', 'labels_dev.npy')

    with smart_open(list_file, 'rb', encoding='utf-8') as listing:
        for index, raw in enumerate(listing):
            model_dir = str(raw).replace('\n', '')
            # the first 68 characters are dropped to form the model tag
            tag = model_dir[68:]
            print(index, '\t', tag)

            vec_train, vec_dev, lab_train, lab_dev = [
                np.load(os.path.join(model_dir, name)) for name in array_files
            ]
            lab_train = np.ravel(lab_train)
            lab_dev = np.ravel(lab_dev)

            forest = RandomForest(tag, vec_train, lab_train, vec_dev, lab_dev,
                                  baseline=False)
            forest.run()
            pred_train, pred_dev = forest.evaluate()

            get_UAR(pred_train, lab_train, np.array([]), 'RF', tag, 'single',
                    baseline=False, train_set=True)
            get_UAR(pred_dev, lab_dev, np.array([]), 'RF', tag, 'single',
                    baseline=False)
def test_text2vec(self):
    """Exercise the ``Text2Vec`` pipeline end to end with a random forest.

    Builds and trains a ``Text2Vec(build_on_corpus=False)`` model, infers
    embeddings for the 'train' and 'dev' partitions, reloads the model, loads
    both embedding sets, runs and evaluates a ``RandomForest`` on them
    (``test=True``) and finally calls ``evaluate_model()`` on the text model.
    """
    text_model = Text2Vec(build_on_corpus=False)
    text_model.build_model()
    text_model.train_model()
    for partition in ('train', 'dev'):
        text_model.infer_embedding(partition)
    text_model.load_model()

    emb_train, lab_train = text_model.load_embedding('train')
    emb_dev, lab_dev = text_model.load_embedding('dev')

    forest = RandomForest('text', emb_train, lab_train, emb_dev, lab_dev,
                          test=True)
    forest.run()
    forest.evaluate()

    text_model.evaluate_model()
def run_AU(self):
    """Run a classifier on AU features (single modality).

    Loads the 'AU' baseline features through ``load_proc_baseline_feature``
    (the instance-id outputs are discarded) and then, depending on
    ``self.model_name``:

    * ``'RF_cv'``: stacks train and dev, iterates over the folds returned by
      ``k_fold_cv``, runs a ``RandomForest`` per fold and stores the per-fold
      session results with ``save_cv_results``.
    * ``'SVM'`` / ``'RF'``: runs ``LinearSVM`` / ``RandomForest`` on the
      upsampled, CSR-converted data.

    Afterwards ``get_UAR`` is called for the train and dev predictions with an
    empty instance array.

    Reads ``self.model_name``, ``self.feature_name`` and ``self.test``.
    """
    print("\nbuilding a classifier on AU features (already session-level)")
    X_train, y_train, _, X_dev, y_dev, _ = load_proc_baseline_feature('AU', verbose=True)

    if self.model_name == 'RF_cv':
        # pool train and dev into one set that the folds index into
        X = np.vstack((X_train, X_dev))
        y = np.hstack((y_train, y_dev))
        assert len(X) == len(y)

        cv_ids = k_fold_cv(len(X))
        cv_res = []
        for (ids_train, ids_dev) in cv_ids:
            # the fold split overwrites the originally loaded train/dev variables
            X_train = X[ids_train]
            y_train = y[ids_train]
            X_dev = X[ids_dev]
            y_dev = y[ids_dev]

            # the variable is named RF_MFCC although this is the AU pipeline
            RF_MFCC = RandomForest(self.feature_name, X_train, y_train, X_dev, y_dev, baseline=True, test=self.test)
            RF_MFCC.run()
            y_pred_train, y_pred_dev = RF_MFCC.evaluate()
            # only the second element returned by get_UAR is kept per fold
            _, session_res = get_UAR(y_pred_dev, y_dev, np.array([]), self.model_name, self.feature_name, 'baseline', baseline=True, test=True)
            cv_res.append(session_res)
        save_cv_results(cv_res, self.model_name, self.feature_name, 'baseline')

    # NOTE(review): the original indentation was lost; as reconstructed, the
    # 'RF_cv' path falls through to the code below using the last fold's
    # split -- confirm whether an early return was intended.
    print("\nupsampling training data to address class imbalance")
    X_train, y_train, _ = upsample(X_train, y_train, np.array([]))

    print("\nobtaining sparse matrix for better classification")
    # X_train = sp.csr_matrix(np.vstack((X_train, X_dev)))
    # X_dev = sp.csr_matrix(X_dev)
    # y_train = np.hstack((y_train, y_dev))
    X_train, X_dev = sp.csr_matrix(X_train), sp.csr_matrix(X_dev)

    if self.model_name == 'SVM':
        SVM_AU = LinearSVM(self.feature_name, X_train, y_train, X_dev, y_dev, baseline=True, test=self.test)
        SVM_AU.run()
        y_pred_train, y_pred_dev = SVM_AU.evaluate()
        # session_prob is assigned but not read again in this function
        session_prob = SVM_AU.get_session_probability()
    elif self.model_name == 'RF':
        RF_AU = RandomForest(self.feature_name, X_train, y_train, X_dev, y_dev, baseline=True, test=self.test)
        RF_AU.run()
        y_pred_train, y_pred_dev = RF_AU.evaluate()
        # session_prob is assigned but not read again in this function
        session_prob = RF_AU.get_session_probability()

    # NOTE(review): y_pred_train / y_pred_dev are only assigned in the
    # branches above (or in the CV loop); any other model_name would reach
    # this point with them unbound.
    get_UAR(y_pred_train, y_train, np.array([]), self.model_name, self.feature_name, 'baseline', baseline=True, train_set=True, test=self.test)
    get_UAR(y_pred_dev, y_dev, np.array([]), self.model_name, self.feature_name, 'baseline', baseline=True, test=self.test)
def RF_CV(self):
    """Cross-validate a random forest on every fused feature set.

    Feature names are read, one per line, from
    ``./pre-trained/fusion/feature_list.txt``. For each feature the saved
    train and dev arrays are stacked, ``k_fold_cv`` supplies the fold indices,
    and a ``RandomForest`` is run per fold. Macro-averaged recall and
    precision of the fold's dev predictions are collected per feature.

    Both result dictionaries are then appended to
    ``results/cross-validation.json``.

    The list file is opened in a ``with`` block so the handle is closed once
    the names have been read; previously it was never closed.

    Raises:
        AssertionError: if a feature did not produce exactly 10 fold results.
    """
    print(
        "\nrunning RF on features selected with RF with doc2vec embeddings"
    )

    # read all names up front and release the file handle immediately
    with smart_open('./pre-trained/fusion/feature_list.txt', 'rb',
                    encoding='utf-8') as feature_path:
        feature_list = [str(line).replace('\n', '') for line in feature_path]

    from sklearn.metrics import precision_recall_fscore_support

    cv_results_UAR = dict()
    cv_results_UAP = dict()

    for feature in feature_list:
        cv_results_UAR[feature] = []
        cv_results_UAP[feature] = []

        # load_label() returns four values; the last two are used here,
        # in the order (dev, train)
        _, _, y_dev, y_train = load_label()
        y_train = y_train.astype('int')
        y_dev = y_dev.astype('int')

        X_train = np.load(
            os.path.join('pre-trained', 'fusion', feature, 'X_train.npy'))
        X_dev = np.load(
            os.path.join('pre-trained', 'fusion', feature, 'X_dev.npy'))

        # pool train and dev into one set that the folds index into
        X = np.vstack((X_train, X_dev))
        y = np.hstack((y_train, y_dev))

        for ids_train, ids_dev in k_fold_cv(len(X)):
            X_train = X[ids_train]
            y_train = y[ids_train]
            X_dev = X[ids_dev]
            y_dev = y[ids_dev]
            print('train on %d test on %d' % (len(y_train), len(y_dev)))

            random_forest = RandomForest(feature, X_train, y_train, X_dev,
                                         y_dev, test=False)
            random_forest.run()
            _, y_pred = random_forest.evaluate()

            precision, recall, _, _ = precision_recall_fscore_support(
                y_dev, y_pred, average='macro')
            cv_results_UAR[feature].append(recall)
            cv_results_UAP[feature].append(precision)

        # exactly ten folds are expected per feature
        assert len(cv_results_UAR[feature]) == len(
            cv_results_UAP[feature]) == 10

    # NOTE: the two objects are appended back to back with no separator, so
    # the file holds a sequence of JSON documents rather than a single one.
    with open(os.path.join('results', 'cross-validation.json'), 'a+',
              encoding='utf-8') as outfile:
        json.dump(cv_results_UAR, outfile)
        json.dump(cv_results_UAP, outfile)
async def predict(input: PredictRequest, clf: RandomForest = Depends(get_model)):
    """Predict labels for the rows in ``input.data``.

    ``input.data`` is converted to a NumPy array and passed to
    ``clf.predict``; the predictions are returned as a list inside a
    ``PredictResponse``. ``clf`` is supplied through ``Depends(get_model)``.
    """
    features = np.array(input.data)
    predictions = clf.predict(features)
    return PredictResponse(data=predictions.tolist())
def get_model():
    """Create the ``"rf_201209"`` ``RandomForest``, call ``load()`` and return it."""
    model = RandomForest(model_name="rf_201209")
    model.load()
    return model