def train_pca_svm(learning_data, pca_dims, probability=True, cache_size=3000, **svm_kwargs):
    """Fit a truncated-SVD projection followed by an SVM classifier.

    The projection is fit only on the training rows whose label is > 0
    ("pseudo-supervised PCA"), then applied to the full train and test sets.
    The SVM is trained on the projected training data and scored on the
    projected test data.

    Args:
        learning_data: A pair ``((X_train, y_train, train_ids),
            (X_test, y_test, test_ids))``. The id entries are unpacked but
            not used here. ``X_*`` must be accepted by ``np.max`` and by
            ``create_csr_matrix``.
        pca_dims: Number of components passed to ``TruncatedSVD``.
        probability: Forwarded to ``SVC(probability=...)``.
        cache_size: Forwarded to ``SVC(cache_size=...)``.
        **svm_kwargs: Any additional keyword arguments forwarded to ``SVC``.

    Returns:
        A tuple ``(svc, pca, x_train_pca, x_test_pca)``: the fitted
        classifier, the fitted projection, and the projected train and test
        matrices. As a side effect ``svc.test_score`` is set to the value
        returned by ``svc.score`` on the test set, and ``pca.n_symbols`` is
        set to the symbol count used to build the sparse matrices.

    Raises:
        ValueError: If no training label is greater than zero, since the
            projection would have nothing to be fit on.
    """
    (X_train, y_train, _train_ids), (X_test, y_test, _test_ids) = learning_data

    # One more than the largest value seen in either split; handed to
    # create_csr_matrix (presumably as the column count - confirm there).
    n_symbols = max(np.max(X_train) + 1, np.max(X_test) + 1)

    logger.info("Forming CSR Matrices")
    x_train = create_csr_matrix(X_train, n_symbols)
    x_test = create_csr_matrix(X_test, n_symbols)

    logger.info("Starting PCA")
    # pseudo-supervised PCA: fit on positive class only.
    # Coerce labels so list/tuple inputs also yield a boolean mask; the
    # original labels are still what gets passed to the classifier below.
    positive_mask = np.asarray(y_train) > 0
    if not positive_mask.any():
        raise ValueError(
            "train_pca_svm requires at least one training sample with label > 0 "
            "to fit the PCA projection"
        )
    pca = TruncatedSVD(n_components=pca_dims).fit(x_train[positive_mask])
    x_train_pca = pca.transform(x_train)
    x_test_pca = pca.transform(x_test)

    logger.info("Starting SVM")
    svc = SVC(probability=probability, cache_size=cache_size, **svm_kwargs)
    svc.fit(x_train_pca, y_train)

    logger.info("Scoring SVM")
    score = svc.score(x_test_pca, y_test)
    logger.info(score)

    # Stash metadata on the estimators so callers can retrieve it later.
    svc.test_score = score
    pca.n_symbols = n_symbols
    return svc, pca, x_train_pca, x_test_pca