Ejemplo n.º 1
0
 def __init__(self, X, Y):
     """Hold the raw pool and precompute Manhattan information densities."""
     self.name = "information density"
     #self.model= QueryInstanceQUIRE(train_data)
     self.X = X
     # No seed examples yet: every pool index starts out unlabeled.
     self.labeled = []
     self.unlabeled = np.arange(len(X))
     # Density of each row w.r.t. the whole pool, Manhattan metric.
     self.densities = information_density(X, 'manhattan')
def run(X, y, n_samples_for_intial, n_queries, estimator):
    """Actively train *estimator* until micro-averaged F1 reaches 0.65.

    Combines uncertainty batch sampling (oversampled 3x) with an
    information-density re-ranking step, teaching BATCH_SIZE examples
    per iteration and pruning them from the pool.

    Args:
        X, y: full (sparse) feature matrix and label vector.
        n_samples_for_intial: size of the random seed training set.
        n_queries: unused here — the loop stops on the F1 threshold.
        estimator: scikit-learn style classifier for the ActiveLearner.

    Returns:
        Total number of annotated samples (iterations * BATCH_SIZE).
    """
    start_time = time.time()

    X_train, y_train, X_pool, y_pool = create_random_pool_and_initial_sets(
        X, y, n_samples_for_intial)

    # Oversample candidates so the density re-ranking below has choices.
    preset_batch = partial(uncertainty_batch_sampling,
                           n_instances=BATCH_SIZE * 3)

    learner = ActiveLearner(estimator=estimator,
                            X_training=X_train,
                            y_training=y_train,
                            query_strategy=preset_batch)

    initial_accuracy = learner.score(X, y)
    print("Initial Accuracy: ", initial_accuracy)
    performance_history = [initial_accuracy]

    f1_score = 0
    index = 0
    while f1_score < 0.65:
        index += 1
        query_index, _ = learner.query(X_pool)

        X_candidate, y_candidate = X_pool[query_index, :], y_pool[query_index]

        # Rank the candidate batch by information density (default cosine
        # metric — note other snippets use 'manhattan'/'euclidean') and
        # keep the BATCH_SIZE densest examples.
        info_density_matrix = information_density(X_candidate)
        candidate_index = info_density_matrix.argsort()[-BATCH_SIZE:][::-1]

        # Teach our ActiveLearner model the records it has requested.
        X_selected, y_selected = X_candidate[
            candidate_index, :], y_candidate[candidate_index]
        learner.teach(X=X_selected, y=y_selected)

        # Map the chosen candidate positions back to pool row indices and
        # remove those rows from the unlabeled pool.
        pool_idx = np.asarray([query_index[idx] for idx in candidate_index])

        X_pool = delete_rows_csr(X_pool, pool_idx)
        y_pool = np.delete(y_pool, pool_idx)

        # Calculate and report our model's f1_score.
        y_pred = learner.predict(X)
        f1_score = metrics.f1_score(y, y_pred, average='micro')
        if index % 20 == 0:
            # BUG FIX: the original referenced an undefined lowercase
            # ``batch_size`` here, raising NameError every 20th iteration.
            print('F1 score after {n} training samples: {f1:0.4f}'.format(
                n=index * BATCH_SIZE, f1=f1_score))

        # Save our model's performance for plotting.
        performance_history.append(f1_score)

    num_of_annotated_samples = index * BATCH_SIZE
    print("--- %s seconds ---" % (time.time() - start_time))
    return num_of_annotated_samples
 def __init__(self, train_data, train_label, params_vect, init_data):
     """Fit a TF-IDF vectorizer on the pool and precompute densities."""
     self.name = "information density"
     #self.model= QueryInstanceQUIRE(train_data)

     self.train_data = train_data
     self.labeled = np.array(init_data)
     # Unigram TF-IDF vocabulary, capped at 10k features; min_df comes
     # from the caller-supplied vectorizer parameters.
     self.vectorizer = tfidfvec(max_features=10000,
                                min_df=params_vect[0],
                                ngram_range=(1, 1))
     self.vectorizer.fit(train_data)
     # Every index starts unlabeled; drop the seed examples one by one.
     self.unlabeled = np.arange(len(train_data))
     for seed in init_data:
         hit = np.argwhere(self.unlabeled == seed)
         self.unlabeled = np.delete(self.unlabeled, hit)
     # Manhattan information density over the vectorized full pool.
     pool_matrix = self.vectorizer.transform(self.train_data)
     self.densities = information_density(pool_matrix, 'manhattan')
Ejemplo n.º 4
0
# train_dataset = pd.read_table('./datasets/avila-tr.txt',header=None, sep=",")
# Frogs MFCC dataset: last 4 columns are metadata/labels, last one is used.
train_dataset = pd.read_csv('datasets/Frogs_MFCCs.csv')
train_data = train_dataset.values
train_features = train_data[:, :-4]
train_labels = train_data[:, -1]
#only for manual datasets
# Re-encode string class names as consecutive integer ids.
lbl_names = np.unique(train_labels)

for class_id, class_name in enumerate(lbl_names):
    train_labels[train_labels == class_name] = class_id
train_labels = train_labels.astype(int)

# Work on copies so the originals survive the pool deletions below.
X_pool = deepcopy(train_features)
y_pool = deepcopy(train_labels)

euc_density = information_density(X_pool, 'euclidean')
# One seed example per class: first occurrence index of each label.
train_idx = np.unique(y_pool, return_index=True)[1]

X_train = X_pool[train_idx]
y_train = y_pool[train_idx]

# Remove the seed rows (and their densities) from the unlabeled pool.
X_pool = np.delete(X_pool, train_idx, axis=0)
y_pool = np.delete(y_pool, train_idx)
euc_density = np.delete(euc_density, train_idx)

learners = []


def density_sampling(classifier, X_pool):
    """Query the pool row with maximal precomputed euclidean density.

    Relies on the module-level ``euc_density`` array staying aligned
    with ``X_pool`` (both are pruned together elsewhere).
    """
    best = np.argmax(euc_density)
    return best, X_pool[best]
import matplotlib.pyplot as plt

from modAL.density import similarize_distance, information_density
from sklearn.datasets import make_blobs
from scipy.spatial.distance import euclidean

# Three gaussian blobs in 2-D to visualize density metrics on.
X, y = make_blobs(n_samples=1000,
                  n_features=2,
                  centers=3,
                  cluster_std=0.7,
                  random_state=0)

# Cosine is modAL's default metric; similarize_distance turns the
# scipy euclidean *distance* into a similarity for the second map.
dens_cos = information_density(X)
dens_euc = information_density(X, similarize_distance(euclidean))

# visualizing the cosine and euclidean information density
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(14, 7))
    panels = [(dens_cos, 'The cosine information density'),
              (dens_euc, 'The euclidean information density')]
    for pos, (dens, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, pos)
        plt.scatter(x=X[:, 0], y=X[:, 1], c=dens, cmap='viridis', s=50)
        plt.title(title)
        plt.colorbar()
    plt.show()
Ejemplo n.º 6
0
def density_sampling(classifier, X_pool):
    """Query the index of the most information-dense instance in X_pool.

    Density is recomputed on every call with the euclidean metric
    (the original local was misleadingly named ``cosine_density``).
    """
    density = information_density(X_pool, 'euclidean')
    idx = np.argmax(density)
    return idx, X_pool[idx]
Ejemplo n.º 7
0
import matplotlib.pyplot as plt

from modAL.density import information_density
from sklearn.datasets import make_blobs

# Three gaussian blobs in 2-D to visualize density metrics on.
X, y = make_blobs(n_samples=1000,
                  n_features=2,
                  centers=3,
                  cluster_std=0.7,
                  random_state=0)

# Same data, two different distance metrics passed by name.
dens_cos = information_density(X, 'cosine')
dens_euc = information_density(X, 'euclidean')

# visualizing the cosine and euclidean information density
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(14, 7))
    panels = [(dens_cos, 'The cosine information density'),
              (dens_euc, 'The euclidean information density')]
    for pos, (dens, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, pos)
        plt.scatter(x=X[:, 0], y=X[:, 1], c=dens, cmap='viridis', s=50)
        plt.title(title)
        plt.colorbar()
    plt.show()
Ejemplo n.º 8
0
def density_sampling(classifier, X):
    """Query the most information-dense instance of ``X`` (euclidean).

    Returns (index, row).

    BUG FIX: the original computed the densities on the module-level
    ``X_pool`` instead of the ``X`` argument, so the returned index could
    point into a different matrix than the row it was used to select.
    """
    euc_density = information_density(X, 'euclidean')
    query_idx = np.argmax(euc_density)
    return query_idx, X[query_idx]