# PCA ------------------------------------------------
# Reduce the protein-interaction feature matrices to 50 latent dimensions via
# truncated SVD (LSA).  The SVD is fit on the left-drug matrix only and reused
# to project the right-drug matrix so both land in the same latent space.
svd_prot = TruncatedSVD(n_components=50, random_state=42)
lsa_ll_prot = svd_prot.fit_transform(ll_prot_sparse)
lsa_rr_prot = svd_prot.transform(rr_prot_sparse)
print(svd_prot.explained_variance_ratio_.sum())

lsa_x_prot = np.concatenate((lsa_ll_prot, lsa_rr_prot), axis=1)
# Append the protein-derived features to the previously computed LSA features.
x = np.concatenate((lsa_x, lsa_x_prot), axis=1)

# one-hot-encode the target
mlb_y = MultiLabelBinarizer()
y = mlb_y.fit_transform(labels)

# prepare training ------------------------------------------
# (removed unused GaussianNB / LogisticRegression instances left over from
# earlier experiments -- only the histogram gradient boosting model is run)
hist = HistGradientBoostingClassifier()
f1, auroc, auprc, ap50, freq = training_with_split(hist, x, y)

# training -----------------------------------------------
df = pd.DataFrame(
    {'auprc': auprc, 'auroc': auroc, 'ap50': ap50, 'f1_score': f1,
     'freq': freq})
df.to_csv('results/baseline_three/hist.csv')
# Build the (drug, drug) pair list and the per-pair side-effect label list.
# Combos are iterated in sorted order so row order is deterministic across
# runs; `labels` is assumed to pre-exist and is appended to in lockstep.
pairs = list()
for combo in sorted(combo2se.keys()):
    labels.append(list(combo2se[combo]))
    pairs.append(list(combo2stitch[combo]))

# replace drug with its id
# (falls back to the raw item when a drug is missing from drugs_dict)
x = list()
for pair in pairs:
    x.append([drugs_dict.get(item, item) for item in pair])

# one hot encode dataset and targets
y = MultiLabelBinarizer().fit_transform(labels)
x = MultiLabelBinarizer().fit_transform(x)
# y_mini = y[:, :10]

# training and evaluate
# (renamed from the misleading `bayes` -- this is a logistic regression;
# also dropped mean_auprc / mean_freq which were computed but never used)
model = LogisticRegression()
f1, auroc, auprc, ap50, freq = training_with_split(model, x, y)
df = pd.DataFrame({
    'auprc': auprc,
    'auroc': auroc,
    'ap50': ap50,
    'f1_score': f1,
    'freq': freq
})
df.to_csv('results/baseline_one/logistic.csv')
print(svd_prot.explained_variance_ratio_.sum())
# Stack left/right protein LSA features, then append them to the existing
# LSA feature matrix built earlier in the pipeline.
lsa_x_prot = np.concatenate((lsa_ll_prot, lsa_rr_prot), axis=1)
x = np.concatenate((lsa_x, lsa_x_prot), axis=1)

# one-hot-encode the target
mlb_y = MultiLabelBinarizer()
y = mlb_y.fit_transform(labels)

# prepare training ------------------------------------------
# (removed unused GaussianNB / LogisticRegression instances and the unused
# mean_auprc / mean_freq aggregates)
boost = HistGradientBoostingClassifier()
# NOTE(review): every other call site unpacks five values
# (f1, auroc, auprc, ap50, freq) from training_with_split; the original
# four-name unpack here would raise ValueError.  ap50 is now captured and
# persisted with the other metrics -- confirm against training_with_split.
# Only the first 10 target columns are used (small trial run).
f1, auroc, auprc, ap50, freq = training_with_split(boost, x, y[:, :10])

# training -----------------------------------------------
df = pd.DataFrame({
    'auprc': auprc,
    'auroc': auroc,
    'ap50': ap50,
    'f1_score': f1,
    'freq': freq
})
df.to_csv('results/baseline_three/pca300_50_try_1.csv')
# Concatenate the left/right LSA drug representations into one feature matrix.
# (removed the commented-out TruncatedSVD / PCA experiments and the unused
# mean_auprc / mean_freq aggregates)
x = np.concatenate((lsa_ll, lsa_rr), axis=1)

lr = LogisticRegression(random_state=1)

# training -----------------------------------------------
# Only the first target column is used -- a single-side-effect smoke run.
f1, auroc, auprc, ap50, freq = training_with_split(lr, x, y[:, :1])
df = pd.DataFrame({
    'auprc': auprc,
    'auroc': auroc,
    'f1_score': f1,
    'ap50': ap50,
    'freq': freq
})
df.to_csv('results/baseline_two/logistic_try.csv')
# Left side: fit the SVD on the left drugs' mono-side-effect matrix.
lsa_ll = svd.fit_transform(ll_sparse)
print(svd.explained_variance_ratio_.sum())

# Right side: map each raw side effect to its id (falling back to the raw
# value when missing from mono_se_dict), binarize with the already-fitted
# MultiLabelBinarizer, and project into the same latent space.
r = list()
for rig in right:
    r.append([str(mono_se_dict.get(item, item)) for item in rig])
del right, rig
# r.insert(0, [str(i) for i in range(len(mono_se_dict))])
rr = mlb.transform(r)
# presumably 10184 is the number of known mono side effects -- TODO confirm
rr = rr[:, :10184]
del r
rr_sparse = sparse.csr_matrix(rr)
del rr
lsa_rr = svd.transform(rr_sparse)
# explained_variance_ratio_ is a fit-time attribute, so this prints the same
# value as above; kept for parity with the original console output.
print(svd.explained_variance_ratio_.sum())

x = np.concatenate((lsa_ll, lsa_rr), axis=1)

# (removed an unused LogisticRegression instance left over from experiments)
# this is much better than when I tried with less
# subsample or less estimators but it is extremely slow
# max_features='sqrt': the former 'auto' alias (identical behaviour for
# classifiers) was deprecated in scikit-learn 1.1 and removed in 1.3.
gradient_boosting = GradientBoostingClassifier(
    random_state=1, n_estimators=100, subsample=0.5, max_features='sqrt')

# training -----------------------------------------------
# Only the first 10 target columns are used (small trial run).
f1, auroc, auprc, ap50, freq = training_with_split(gradient_boosting,
                                                   x, y[:, :10])
df = pd.DataFrame(
    {'auprc': auprc, 'auroc': auroc, 'f1_score': f1, 'ap50': ap50,
     'freq': freq})
df.to_csv('results/baseline_two/gbtree_try3.csv')