Example #1
import pandas as pd
import deepchem as dc
from deepchem.data import DiskDataset


def merge_and_split(select_idx, new_set, train_set, test_set, seed=None):
    # get df of the train set and test set first
    train_df = to_dataframe(train_set)
    test_df = to_dataframe(test_set)
    # then get df of all molecules from the new set in this current loop
    new_data_df = to_dataframe(new_set)

    # filter out the uncertain molecules from the new set
    x_uncertain = new_data_df.X[select_idx]
    y_uncertain = new_data_df.y[select_idx]
    w_uncertain = new_data_df.w[select_idx]
    id_uncertain = new_data_df.ids[select_idx]
    # and form the uncertain df by combining the columns
    uncertain_df = pd.concat(
        [x_uncertain, y_uncertain, w_uncertain, id_uncertain], axis=1)

    # then combine all train, test, and uncertain dfs together into 1 dataframe
    total_df = pd.concat([train_df, test_df, uncertain_df], axis=0)
    # reset the index so the combined frame has a consistent 0..n-1 index
    total_df = total_df.reset_index(drop=True)
    # round-trip through from_dataframe so the result is consistent with a
    # normally loaded disk dataset
    total_set = from_dataframe(total_df)
    final_disk_data = DiskDataset.from_numpy(
        X=total_set.X.transpose()[0],  # to keep consistent dataset shape
        y=total_set.y,
        w=total_set.w,
        ids=total_set.ids)

    # finally, do an 80/20 random split of the combined set
    splitter = dc.splits.RandomSplitter()
    new_tot_train, new_tot_test = splitter.train_test_split(final_disk_data,
                                                            frac_train=0.8,
                                                            seed=seed)
    return new_tot_train, new_tot_test
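
# to_dataframe / from_dataframe are not shown in this listing. A minimal
# sketch of what they presumably wrap, assuming DeepChem's built-in
# Dataset.to_dataframe() / Dataset.from_dataframe() round-trip:
def to_dataframe(dataset):
    # thin wrapper over DeepChem's Dataset.to_dataframe()
    return dataset.to_dataframe()


def from_dataframe(df):
    # rebuild a dataset from the column layout (X..., y..., w, ids)
    # produced by Dataset.to_dataframe()
    return dc.data.Dataset.from_dataframe(df)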
Example #2
import timeit

import numpy as np
import deepchem as dc
from deepchem.data import DiskDataset
from sklearn.model_selection import KFold

# X, y, w are assumed to hold the features, labels, and weights of an
# already-featurized dataset (see Example #3 for one way to obtain them)

# Fit models
metric = dc.metrics.Metric(dc.metrics.roc_auc_score,
                           np.mean,
                           mode="classification")

kf = KFold(n_splits=5, shuffle=True, random_state=123)

all_train_scores = []
all_test_scores = []

start = timeit.default_timer()

for train_index, test_index in kf.split(X):

    train_dataset = DiskDataset.from_numpy(X[train_index],
                                           y[train_index, :],
                                           w[train_index, :])
    test_dataset = DiskDataset.from_numpy(X[test_index],
                                          y[test_index, :],
                                          w[test_index, :])

    # Number of features on conv-mols
    n_feat = 75
    # Batch size of models
    batch_size = 50
    # legacy dc.nn graph API (removed in later DeepChem releases); the
    # original listing is truncated inside this commented-out block
    '''
    graph_model = dc.nn.SequentialGraph(n_feat)
    graph_model.add(dc.nn.GraphConv(64, n_feat, activation='relu'))
    graph_model.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
    graph_model.add(dc.nn.GraphPool())
    '''
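
    # A rough modern-API sketch of the same stack (an assumption, not the
    # author's code): DeepChem's GraphConvModel bundles the
    # GraphConv/BatchNorm/GraphPool layers the legacy code built by hand.
    model = dc.models.GraphConvModel(n_tasks=y.shape[1],
                                     graph_conv_layers=[64],
                                     batch_size=batch_size,
                                     mode="classification")
    model.fit(train_dataset, nb_epoch=10)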
Example #3
import timeit

import numpy as np
import deepchem as dc
from deepchem.data import DiskDataset
from sklearn.model_selection import KFold

# tox21_tasks / tox21_datasets / transformers as produced by the MolNet
# Tox21 loader (assumed source of these variables)
tox21_tasks, tox21_datasets, transformers = dc.molnet.load_tox21()

train_dataset, valid_dataset, test_dataset = tox21_datasets

X = train_dataset.X
y = train_dataset.y
w = train_dataset.w

kf = KFold(n_splits=5, shuffle=True, random_state=123)

all_train_scores = []
all_test_scores = []

start = timeit.default_timer()

for train_index, test_index in kf.split(X):

    train_dataset = DiskDataset.from_numpy(X[train_index, :],
                                           y[train_index, :],
                                           w[train_index, :])
    test_dataset = DiskDataset.from_numpy(X[test_index, :],
                                          y[test_index, :],
                                          w[test_index, :])

    # Fit models
    metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean, mode='classification')

    model = dc.models.MultiTaskClassifier(
        len(tox21_tasks),
        train_dataset.get_data_shape()[0],
        layer_sizes=[1500],
        bias_init_consts=[1.],
        dropouts=[0.5],
        weight_decay_penalty=0.1,
        weight_decay_penalty_type='l2',
        learning_rate=0.001,
        weight_init_stddevs=[0.02],
        batch_size=50)

    # Fit trained model
    model.fit(train_dataset, nb_epoch=10)
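
    # Assumed continuation (the listing is cut off here): score this fold
    # with the metric defined above and accumulate the per-fold results.
    train_scores = model.evaluate(train_dataset, [metric], transformers)
    test_scores = model.evaluate(test_dataset, [metric], transformers)
    all_train_scores.append(train_scores)
    all_test_scores.append(test_scores)

stop = timeit.default_timer()
print("5-fold cross-validation took %.1f s" % (stop - start))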