def get_recommended_items_for_labeling(self, workspace_id, model_id, dataset_name, category_name, sample_size=1):
    """Recommend up to *sample_size* unlabeled elements to label next, via core-set MIP sampling.

    :param workspace_id:
    :param model_id: unused here; kept for interface compatibility with other strategies.
    :param dataset_name:
    :param category_name:
    :param sample_size: number of elements to recommend (default 1).
    :return: a list of selected text elements.
    """
    data_access = get_data_access()
    unlabeled = self.get_unlabeled_data(workspace_id, dataset_name, category_name, self.max_to_consider)
    labeled = data_access.sample_labeled_text_elements(
        workspace_id, dataset_name, category_name, self.max_to_consider)["results"]  # s0 in original paper
    if self.max_to_consider and self.max_to_consider < len(unlabeled):
        unlabeled = unlabeled[:self.max_to_consider]
    X_train = labeled + unlabeled
    labeled_idx = np.arange(len(labeled))
    if len(labeled) == 0:
        # BUG FIX: the original returned X_train[list(range(sample_size))], which raises
        # TypeError (a Python list cannot be indexed with a list). A slice returns the
        # same prefix and also tolerates sample_size > len(X_train).
        return X_train[:sample_size]
    sampler = CoreSetMIPSampling(robustness_percentage=self.robustness_percentage, greedy=self.greedy)
    embeddings = np.array(
        orchestrator_api.infer(workspace_id, category_name, X_train)["embeddings"])
    res = sampler.query(X_train=np.array(X_train), labeled_idx=labeled_idx,
                        amount=sample_size, representation=embeddings)
    # Hoist the labeled-index set out of the comprehension: the original rebuilt
    # set(labeled_idx) on every iteration, making the filter accidentally quadratic.
    labeled_idx_set = set(labeled_idx)
    selected_idx = np.sort([idx for idx in res if idx not in labeled_idx_set])
    res = np.array(X_train)[selected_idx[:sample_size]]
    return res.tolist()
def get_recommended_items_for_labeling(self, workspace_id, model_id, dataset_name, category_name, sample_size=1):
    """Recommend up to *sample_size* unlabeled elements via iterative discriminative sampling.

    :param workspace_id:
    :param model_id: unused here; kept for interface compatibility with other strategies.
    :param dataset_name:
    :param category_name:
    :param sample_size: number of elements to recommend (default 1).
    :return: a list of selected text elements.
    """
    data_access = get_data_access()
    unlabeled_elements = self.get_unlabeled_data(workspace_id, dataset_name, category_name, self.max_to_consider)
    labeled_elements = data_access.sample_labeled_text_elements(
        workspace_id, dataset_name, category_name, self.max_to_consider)["results"]
    # Candidates are labeled first, then unlabeled, so index blocks are contiguous.
    candidates = labeled_elements + unlabeled_elements
    n_labeled = len(labeled_elements)
    labeled_idx = np.arange(n_labeled)
    unlabeled_idx = n_labeled + np.arange(len(unlabeled_elements))
    embeddings = np.array(
        orchestrator_api.infer(workspace_id, category_name, candidates)["embeddings"])
    # Iteratively sub-sample using the discriminative sampling routine.
    per_batch = int(sample_size / self.sub_batches)
    chosen_idx = self.get_selected_indices(
        embeddings, labeled_idx, unlabeled_idx, sample_size, per_batch)
    return np.array(candidates)[chosen_idx].tolist()
def load_dataset(dataset_name: str, force_new=False, processor_factory=data_processor_factory) -> List[Document]:
    """
    Load the Documents of the given dataset.

    :param dataset_name:
    :param force_new: default is False.
    :param processor_factory: a factory for DataProcessorAPI. Default is data_processor_factory.
    :return: the loaded documents, or None when an existing dump was found and force_new is False.
    """
    docs_dir = utils.get_documents_dump_dir(dataset_name)
    # NOTE(review): despite the List[Document] annotation, the skip path returns None;
    # preserved as-is since callers may rely on it.
    if os.path.isdir(docs_dir) and not force_new:
        logging.info(
            f'{dataset_name}:\t\tskipping loading documents as {docs_dir} exists. '
            f'You can force a new loading by passing the parameter force_new=True')
        return None
    data_access = data_access_factory.get_data_access()
    processor: DataProcessorAPI = processor_factory.get_data_processor(dataset_name)
    documents = processor.build_documents()
    data_access.add_documents(dataset_name=dataset_name, documents=documents)
    element_count = sum(len(doc.text_elements) for doc in documents)
    logging.info(
        f'{dataset_name}:\t\tloaded {len(documents)} documents '
        f'({element_count} text elements) under {docs_dir}')
    return documents
def get_per_element_score(self, items, workspace_id, model_id, dataset_name, category_name):
    """Score each element in *items* using embeddings of the labeled pos/neg examples.

    :param items: elements to score.
    :param workspace_id:
    :param model_id: unused here; kept for interface compatibility.
    :param dataset_name:
    :param category_name:
    :return: per-element scores as produced by self.get_scores_from_embeddings.
    """
    from lrtc_lib.orchestrator import orchestrator_api
    data_access = get_data_access()
    labeled = data_access.sample_labeled_text_elements(workspace_id, dataset_name, category_name,
                                                       self.max_to_consider)["results"]
    train_embeddings = np.array(orchestrator_api.infer(workspace_id, category_name, labeled)["embeddings"])
    unlabeled_embeddings = np.array(orchestrator_api.infer(workspace_id, category_name, items)["embeddings"])
    pos_idx = get_pos_idx(category_name, labeled)
    pos = train_embeddings[pos_idx]
    # PERF FIX: build the membership set once. The original tested `i not in pos_idx`
    # per index, which is O(len(pos_idx)) each time if pos_idx is a list — O(n^2) overall.
    pos_idx_set = set(pos_idx)
    neg = train_embeddings[[i for i in range(len(train_embeddings)) if i not in pos_idx_set]]
    scores = self.get_scores_from_embeddings(pos, neg, unlabeled_embeddings)
    return scores
def get_unlabeled_data(self, workspace_id: str, dataset_name: str, category_name: str, max_to_consider: int) \
        -> Sequence[TextElement]:
    """
    Return a list of up to *max_to_consider* elements that are unlabeled for a given dataset and category.

    :param workspace_id:
    :param dataset_name:
    :param category_name:
    :param max_to_consider:
    """
    from lrtc_lib.data_access.data_access_factory import get_data_access
    # Duplicates are removed so the active learner is not biased toward repeated text.
    sample = get_data_access().sample_unlabeled_text_elements(
        workspace_id, dataset_name, category_name, max_to_consider, remove_duplicates=True)
    unlabeled_elements = sample["results"]
    logging.info(f"Got {len(unlabeled_elements)} unlabeled elements for active learning")
    return unlabeled_elements
def __init__(self, first_model_positives_num: int, first_model_negatives_num: int,
             active_learning_suggestions_num: int):
    """
    Init the ExperimentsRunner

    :param first_model_positives_num: the number of positives instances to provide for the first model.
    :param first_model_negatives_num: the number of negative instances to provide for the first model.
    :param active_learning_suggestions_num: the number of instances to be suggested by the active learning
    strategy for the training of the second model.
    """
    self.first_model_positives_num = first_model_positives_num
    self.first_model_negatives_num = first_model_negatives_num
    self.active_learning_suggestions_num = active_learning_suggestions_num
    self.data_access: DataAccessApi = data_access_factory.get_data_access()
    # First-model scores have not been computed/cached yet.
    self.cached_first_model_scores = False
    # Experiments always train on every labeled example.
    orchestrator_api.set_training_set_selection_strategy(TrainingSetSelectionStrategy.ALL_LABELED)
def compute_batch_scores(config, elements):
    """Compute diversity and representativeness scores for a batch of elements.

    :param config: experiment config carrying workspace_id, train_dataset_name and category_name.
    :param elements: the batch of elements to score.
    :return: tuple (diversity_value, representativeness_value).
    """
    data_access = get_data_access()
    # Use (up to) a million unlabeled elements as the reference population.
    unlabeled_pool = data_access.sample_unlabeled_text_elements(
        config.workspace_id, config.train_dataset_name, config.category_name, 10**6)["results"]
    pool_embeddings = np.array(
        orchestrator_api.infer(config.workspace_id, config.category_name, unlabeled_pool)["embeddings"])
    batch_embeddings = np.array(
        orchestrator_api.infer(config.workspace_id, config.category_name, elements)["embeddings"])
    # Representativeness is the inverse of the kNN outlier score against the pool.
    outlier_value = KnnOutlierCalculator(pool_embeddings).compute_batch_score(batch_embeddings)
    representativeness_value = 1 / outlier_value
    diversity_value = DiversityCalculator(pool_embeddings).compute_batch_score(batch_embeddings)
    return diversity_value, representativeness_value
# LICENSE: Apache License 2.0 (Apache-2.0) # http://www.apache.org/licenses/LICENSE-2.0 import random import unittest from typing import List import lrtc_lib.data_access.data_access_factory as data_access_factory import lrtc_lib.data_access.single_dataset_loader as ds_loader from lrtc_lib.data_access.core.data_structs import Document, TextElement, Label from lrtc_lib.data_access.core.utils import URI_SEP from lrtc_lib.data_access.data_access_in_memory import DataAccessInMemory from lrtc_lib.orchestrator.orchestrator_api import LABEL_POSITIVE, LABEL_NEGATIVE data_access: DataAccessInMemory = data_access_factory.get_data_access() def generate_simple_doc(dataset_name, doc_id=0, add_duplicate=False): sentences = [ 'Document Title is Super Interesting', 'First sentence is not that attractive.', 'The second one is a bit better.', 'Last sentence offers a promising view for the future!' ] if add_duplicate: sentences.append('Document Title is Super Interesting') text_elements = [] start_span = 0 for idx, sentence in enumerate(sentences): end_span = start_span + len(sentence)
TRAIN_COUNTS_STR_KEY = "train_counts" DEV_COUNTS_STR_KEY = "dev_counts" LABEL_POSITIVE = "true" LABEL_NEGATIVE = "false" BINARY_LABELS = frozenset({LABEL_NEGATIVE, LABEL_POSITIVE}) # members active_learning_strategy = PROJECT_PROPERTIES["active_learning_strategy"] training_set_selection_strategy = PROJECT_PROPERTIES["training_set_selection"] active_learner = PROJECT_PROPERTIES[ "active_learning_factory"].get_active_learner(active_learning_strategy) data_access = data_access_factory.get_data_access() train_and_dev_sets_selector = training_set_selector_factory.get_training_set_selector( selector=training_set_selection_strategy) def _delete_orphan_labels(): """ delete labels that are not attached to a known workspace """ all_label_dump_files = glob.glob( get_workspace_labels_dump_filename(workspace_id='*', dataset_name='*')) existing_workspace_ids = [ w.workspace_id for w in orchestrator_state_api.get_all_workspaces() ] dump_files_with_parents = [