Example #1
0
# PCA ------------------------------------------------
# Reduce the protein one-hot features to 50 latent dimensions.  The SVD is
# fitted on the left-drug matrix and reused to project the right-drug matrix
# so both sides live in the same latent space.
svd_prot = TruncatedSVD(n_components=50, random_state=42)
lsa_ll_prot = svd_prot.fit_transform(ll_prot_sparse)
lsa_rr_prot = svd_prot.transform(rr_prot_sparse)
print(svd_prot.explained_variance_ratio_.sum())

# Per-pair protein features: left-drug and right-drug embeddings side by side.
lsa_x_prot = np.concatenate((lsa_ll_prot, lsa_rr_prot), axis=1)

# Final design matrix: earlier latent features plus protein latent features.
x = np.concatenate((lsa_x, lsa_x_prot), axis=1)

# one-hot-encode the target
mlb_y = MultiLabelBinarizer()
y = mlb_y.fit_transform(labels)


# prepare training ------------------------------------------
# FIX: removed `bayes = GaussianNB()` and `lr = LogisticRegression(...)` —
# both were instantiated but never used in this run.

hist = HistGradientBoostingClassifier()

f1, auroc, auprc, ap50, freq = training_with_split(hist, x, y)


# training -----------------------------------------------
# Persist per-label metrics for later comparison across baselines.

df = pd.DataFrame(
    {'auprc': auprc, 'auroc': auroc, 'ap50': ap50, 'f1_score': f1, 'freq': freq})
df.to_csv('results/baseline_three/hist.csv')
Example #2
0
# Build the (drug pair, side-effect set) dataset from the combo dictionaries,
# iterating in sorted key order so rows are reproducible across runs.
# NOTE(review): `labels` is appended to but not initialised here — presumably
# created (or reset) earlier in the file; confirm before reusing this chunk.
pairs = list()
for combo in sorted(combo2se.keys()):
    labels.append(list(combo2se[combo]))
    pairs.append(list(combo2stitch[combo]))

# replace drug with its id (fall back to the raw item when unmapped)
x = list()
for pair in pairs:
    x.append([drugs_dict.get(item, item) for item in pair])

# one hot encode dataset and targets
y = MultiLabelBinarizer().fit_transform(labels)
x = MultiLabelBinarizer().fit_transform(x)

# y_mini = y[:, :10]

# training and evaluate
# FIX: renamed `bayes` -> `lr`; the estimator is a LogisticRegression, not a
# naive Bayes model, and the misleading name invited confusion.  Also removed
# `mean_auprc`/`mean_freq`, which were computed but never used.
lr = LogisticRegression()
f1, auroc, auprc, ap50, freq = training_with_split(lr, x, y)

df = pd.DataFrame({
    'auprc': auprc,
    'auroc': auroc,
    'ap50': ap50,
    'f1_score': f1,
    'freq': freq
})
df.to_csv('results/baseline_one/logistic.csv')
Example #3
0
print(svd_prot.explained_variance_ratio_.sum())

# Per-pair protein features: left-drug and right-drug embeddings side by side.
lsa_x_prot = np.concatenate((lsa_ll_prot, lsa_rr_prot), axis=1)

x = np.concatenate((lsa_x, lsa_x_prot), axis=1)

# one-hot-encode the target
mlb_y = MultiLabelBinarizer()
y = mlb_y.fit_transform(labels)

# prepare training ------------------------------------------
# FIX: removed unused `bayes = GaussianNB()` and `lr = LogisticRegression(...)`.

boost = HistGradientBoostingClassifier()

# FIX: every other call to training_with_split in this file unpacks five
# values (f1, auroc, auprc, ap50, freq); unpacking only four here would raise
# a ValueError.  Restricted to the first 10 target columns for a quick try.
f1, auroc, auprc, ap50, freq = training_with_split(boost, x, y[:, :10])

# training -----------------------------------------------
# Removed `mean_auprc`/`mean_freq` — computed but never used.

df = pd.DataFrame({
    'auprc': auprc,
    'auroc': auroc,
    'ap50': ap50,
    'f1_score': f1,
    'freq': freq
})
df.to_csv('results/baseline_three/pca300_50_try_1.csv')
Example #4
0
# Per-pair design matrix: left-drug and right-drug latent features side by side.
x = np.concatenate((lsa_ll, lsa_rr), axis=1)

# FIX: deleted the commented-out sparse/TruncatedSVD/PCA experiments that were
# parked here, and the unused `mean_auprc`/`mean_freq` aggregates — dead code
# obscured the actual run.

lr = LogisticRegression(random_state=1)

# training -----------------------------------------------
# Quick sanity run on the first target column only.
f1, auroc, auprc, ap50, freq = training_with_split(lr, x, y[:, :1])

df = pd.DataFrame({
    'auprc': auprc,
    'auroc': auroc,
    'f1_score': f1,
    'ap50': ap50,
    'freq': freq
})
df.to_csv('results/baseline_two/logistic_try.csv')
Example #5
0
# Project the left-hand matrix and report the variance retained by the SVD.
lsa_ll = svd.fit_transform(ll_sparse)
print(svd.explained_variance_ratio_.sum())

# Map each right-hand drug's items to their ids (fall back to the raw item
# when unmapped).  FIX: the comprehension replaces the append loop and the
# `del right, rig` statement — `rig` was only bound by the loop, so the old
# `del` would raise NameError if `right` were ever empty.
r = [[str(mono_se_dict.get(item, item)) for item in rig] for rig in right]
del right
# r.insert(0, [str(i) for i in range(len(mono_se_dict))])
rr = mlb.transform(r)
rr = rr[:, :10184]
del r
# Sparsify before the SVD projection, freeing the dense intermediates to keep
# peak memory down.
rr_sparse = sparse.csr_matrix(rr)
del rr
lsa_rr = svd.transform(rr_sparse)
# NOTE: transform() does not refit, so this prints the same ratio as above.
print(svd.explained_variance_ratio_.sum())

x = np.concatenate((lsa_ll, lsa_rr), axis=1)

# FIX: removed unused `lr = LogisticRegression(random_state=1)`.
# this is much better than when I tried with less
# subsample or less estimators but it is extremely slow
# FIX: max_features='auto' was deprecated in scikit-learn 1.1 and removed in
# 1.3 for GradientBoostingClassifier; 'sqrt' is what 'auto' resolved to for
# classifiers, so behaviour is unchanged on supported versions.
gradient_boosting = GradientBoostingClassifier(
    random_state=1, n_estimators=100, subsample=0.5, max_features='sqrt')

# training -----------------------------------------------
f1, auroc, auprc, ap50, freq = training_with_split(gradient_boosting, x, y[:, :10])

df = pd.DataFrame(
    {'auprc': auprc, 'auroc': auroc, 'f1_score': f1, 'ap50': ap50, 'freq': freq})
df.to_csv('results/baseline_two/gbtree_try3.csv')