import numpy as np
from modAL.density import information_density


def __init__(self, X, Y):
    self.name = "information density"
    # self.model = QueryInstanceQUIRE(train_data)
    self.labeled = []
    self.X = X
    # All indices start out unlabeled; the densities are precomputed once.
    self.unlabeled = np.arange(len(X))
    self.densities = information_density(X, 'manhattan')
import time
from functools import partial

import numpy as np
from sklearn import metrics
from modAL.models import ActiveLearner
from modAL.batch import uncertainty_batch_sampling
from modAL.density import information_density

BATCH_SIZE = 3  # assumed module-level constant; the original's in-loop comment suggests 3


def run(X, y, n_samples_for_initial, n_queries, estimator):
    start_time = time.time()
    X_train, y_train, X_pool, y_pool = create_random_pool_and_initial_sets(
        X, y, n_samples_for_initial)
    preset_batch = partial(uncertainty_batch_sampling, n_instances=BATCH_SIZE * 3)
    learner = ActiveLearner(estimator=estimator, X_training=X_train,
                            y_training=y_train, query_strategy=preset_batch)
    initial_accuracy = learner.score(X, y)
    print("Initial Accuracy: ", initial_accuracy)
    performance_history = [initial_accuracy]
    f1_score = 0
    index = 0
    while f1_score < 0.65:
        index += 1
        query_index, _ = learner.query(X_pool)
        X_candidate, y_candidate = X_pool[query_index, :], y_pool[query_index]
        # Compute the information density of the candidates and keep the
        # BATCH_SIZE most information-dense examples.
        info_density_matrix = information_density(X_candidate)
        candidate_index = info_density_matrix.argsort()[-BATCH_SIZE:][::-1]
        # Teach our ActiveLearner model the records it has requested.
        X_selected, y_selected = X_candidate[candidate_index, :], y_candidate[candidate_index]
        learner.teach(X=X_selected, y=y_selected)
        # Remove the queried instances from the unlabeled pool.
        pool_idx_list = []
        for idx in candidate_index:
            row = query_index[idx]
            pool_idx_list.append(row)
        pool_idx = np.asarray(pool_idx_list)
        X_pool = delete_rows_csr(X_pool, pool_idx)
        y_pool = np.delete(y_pool, pool_idx)
        # Calculate and report our model's f1_score.
        y_pred = learner.predict(X)
        f1_score = metrics.f1_score(y, y_pred, average='micro')
        if index % 20 == 0:
            print('F1 score after {n} training samples: {f1:0.4f}'.format(
                n=index * BATCH_SIZE, f1=f1_score))
        # Save our model's performance for plotting.
        performance_history.append(f1_score)
    num_of_annotated_samples = index * BATCH_SIZE
    print("--- %s seconds ---" % (time.time() - start_time))
    return num_of_annotated_samples
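# The run() snippet above calls two helpers that are not shown. Below is a
# minimal sketch of both under stated assumptions, not the original code:
# create_random_pool_and_initial_sets presumably splits off a random seed set,
# and delete_rows_csr is the usual recipe for dropping rows from a CSR matrix.
import numpy as np


def create_random_pool_and_initial_sets(X, y, n_samples):
    # Randomly pick the initial labeled set; everything else becomes the pool.
    initial_idx = np.random.choice(len(y), size=n_samples, replace=False)
    pool_mask = np.ones(len(y), dtype=bool)
    pool_mask[initial_idx] = False
    return X[initial_idx], y[initial_idx], X[pool_mask], y[pool_mask]


def delete_rows_csr(matrix, row_indices):
    # CSR matrices do not support np.delete, so mask out the rows instead.
    mask = np.ones(matrix.shape[0], dtype=bool)
    mask[row_indices] = False
    return matrix[mask]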
import numpy as np
from modAL.density import information_density
# Assumption: the original's tfidfvec is an alias for scikit-learn's TfidfVectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer as tfidfvec


def __init__(self, train_data, train_label, params_vect, init_data):
    self.name = "information density"
    # self.model = QueryInstanceQUIRE(train_data)
    self.labeled = np.array(init_data)
    self.vectorizer = tfidfvec(max_features=10000, min_df=params_vect[0],
                               ngram_range=(1, 1))
    self.vectorizer.fit(train_data)
    self.train_data = train_data
    self.unlabeled = np.arange(len(train_data))
    # Drop the initially labeled indices from the unlabeled set.
    for i in init_data:
        indexArr = np.argwhere(self.unlabeled == i)
        self.unlabeled = np.delete(self.unlabeled, indexArr)
    X_full_Vect = self.vectorizer.transform(self.train_data)
    self.densities = information_density(X_full_Vect, 'manhattan')
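# A hypothetical companion query() method (an assumption, not part of the
# original class): with the densities precomputed in __init__, querying reduces
# to an argmax over the still-unlabeled indices.
def query(self):
    # Index of the unlabeled example with the highest information density.
    pick = self.unlabeled[np.argmax(self.densities[self.unlabeled])]
    self.labeled = np.append(self.labeled, pick)
    self.unlabeled = np.delete(self.unlabeled, np.argwhere(self.unlabeled == pick))
    return pick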
from copy import deepcopy

import numpy as np
import pandas as pd
from modAL.density import information_density

# train_dataset = pd.read_table('./datasets/avila-tr.txt', header=None, sep=",")
train_dataset = pd.read_csv('datasets/Frogs_MFCCs.csv')
train_data = train_dataset.values
train_features = train_data[:, :-4]
train_labels = train_data[:, -1]

# Only for manual datasets: map the string class names to integer labels.
lbl_names = np.unique(train_labels)
for i in range(len(lbl_names)):
    train_labels[train_labels == lbl_names[i]] = i
train_labels = train_labels.astype(int)

X_pool = deepcopy(train_features)
y_pool = deepcopy(train_labels)
euc_density = information_density(X_pool, 'euclidean')

# Seed the initial training set with one example per class.
train_idx = np.unique(y_pool, return_index=True)[1]
X_train = X_pool[train_idx]
y_train = y_pool[train_idx]
X_pool = np.delete(X_pool, train_idx, axis=0)
y_pool = np.delete(y_pool, train_idx)
euc_density = np.delete(euc_density, train_idx)

learners = []


def density_sampling(classifier, X_pool):
    # Query the pool example with the highest precomputed information density.
    query_idx = np.argmax(euc_density)
    return query_idx, X_pool[query_idx]
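# A hedged usage sketch (not in the original): density_sampling can be plugged
# into modAL's ActiveLearner as its query_strategy. RandomForestClassifier is
# an assumed stand-in for whatever estimator the original used.
from modAL.models import ActiveLearner
from sklearn.ensemble import RandomForestClassifier

learner = ActiveLearner(estimator=RandomForestClassifier(),
                        X_training=X_train, y_training=y_train,
                        query_strategy=density_sampling)
query_idx, query_instance = learner.query(X_pool)
learner.teach(X_pool[[query_idx]], y_pool[[query_idx]])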
import matplotlib.pyplot as plt
from modAL.density import similarize_distance, information_density
from sklearn.datasets import make_blobs
from scipy.spatial.distance import euclidean

X, y = make_blobs(n_features=2, n_samples=1000, centers=3, random_state=0,
                  cluster_std=0.7)

cosine_density = information_density(X)
euclidean_density = information_density(X, similarize_distance(euclidean))

# visualizing the cosine and euclidean information density
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(14, 7))
    plt.subplot(1, 2, 1)
    plt.scatter(x=X[:, 0], y=X[:, 1], c=cosine_density, cmap='viridis', s=50)
    plt.title('The cosine information density')
    plt.colorbar()
    plt.subplot(1, 2, 2)
    plt.scatter(x=X[:, 0], y=X[:, 1], c=euclidean_density, cmap='viridis', s=50)
    plt.title('The euclidean information density')
    plt.colorbar()
    plt.show()
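# Hedged aside (an assumption, following the same modAL version as the snippet
# above): similarize_distance wraps a SciPy distance into a similarity of the
# form 1 / (1 + d), so other distances plug in the same way, e.g. Manhattan:
from scipy.spatial.distance import cityblock

manhattan_density = information_density(X, similarize_distance(cityblock))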
import numpy as np
from modAL.density import information_density


def density_sampling(classifier, X_pool):
    # Density of each pool point under the euclidean metric.
    euclidean_density = information_density(X_pool, 'euclidean')
    query_idx = np.argmax(euclidean_density)
    return query_idx, X_pool[query_idx]
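# A hedged sketch, not from the original: pure density sampling never consults
# the model. A common refinement (Settles' density-weighted framework) scales
# the classifier's uncertainty by the information density before the argmax.
import numpy as np
from modAL.uncertainty import classifier_uncertainty
from modAL.density import information_density


def density_weighted_sampling(classifier, X_pool):
    # Per-sample uncertainty (1 - max class probability), weighted by density.
    uncertainty = classifier_uncertainty(classifier, X_pool)
    density = information_density(X_pool, 'euclidean')
    query_idx = np.argmax(uncertainty * density)
    return query_idx, X_pool[query_idx]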
import matplotlib.pyplot as plt
from modAL.density import information_density
from sklearn.datasets import make_blobs

X, y = make_blobs(n_features=2, n_samples=1000, centers=3, random_state=0,
                  cluster_std=0.7)

cosine_density = information_density(X, 'cosine')
euclidean_density = information_density(X, 'euclidean')

# visualizing the cosine and euclidean information density
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(14, 7))
    plt.subplot(1, 2, 1)
    plt.scatter(x=X[:, 0], y=X[:, 1], c=cosine_density, cmap='viridis', s=50)
    plt.title('The cosine information density')
    plt.colorbar()
    plt.subplot(1, 2, 2)
    plt.scatter(x=X[:, 0], y=X[:, 1], c=euclidean_density, cmap='viridis', s=50)
    plt.title('The euclidean information density')
    plt.colorbar()
    plt.show()
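# Hedged note (an assumption consistent with the snippets above, which also use
# the 'manhattan' string): the metric name is forwarded to the underlying
# pairwise-distance computation, so other supported metrics work the same way:
manhattan_density = information_density(X, 'manhattan')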
import numpy as np
from modAL.density import information_density


def density_sampling(classifier, X):
    # Density of each candidate point under the euclidean metric.
    euc_density = information_density(X, 'euclidean')
    query_idx = np.argmax(euc_density)
    return query_idx, X[query_idx]
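# Hedged variant (an assumption, not in the original): to query a batch of
# points rather than a single one, replace the argmax with a sorted slice,
# mirroring the argsort pattern used in run() above.
def density_sampling_batch(classifier, X, n_instances=5):
    euc_density = information_density(X, 'euclidean')
    # Indices of the n_instances densest points, most dense first.
    query_idx = np.argsort(euc_density)[-n_instances:][::-1]
    return query_idx, X[query_idx]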