Code example #1
0
    def get_recommended_items_for_labeling(self,
                                           workspace_id,
                                           model_id,
                                           dataset_name,
                                           category_name,
                                           sample_size=1):
        """
        Recommend up to *sample_size* unlabeled elements to label next,
        using core-set MIP sampling over the inferred embeddings.

        :param workspace_id:
        :param model_id: unused here; kept for interface compatibility
        :param dataset_name:
        :param category_name:
        :param sample_size: number of elements to recommend (default 1)
        :return: a list of recommended text elements
        """
        data_access = get_data_access()
        unlabeled = self.get_unlabeled_data(workspace_id, dataset_name,
                                            category_name,
                                            self.max_to_consider)
        labeled = data_access.sample_labeled_text_elements(
            workspace_id, dataset_name, category_name,
            self.max_to_consider)["results"]  # s0 in original paper
        if self.max_to_consider and self.max_to_consider < len(unlabeled):
            unlabeled = unlabeled[:self.max_to_consider]
        X_train = labeled + unlabeled
        labeled_idx = np.array(list(range(len(labeled))))
        if len(labeled) == 0:
            # BUGFIX: the original `X_train[list(range(sample_size))]` raises
            # TypeError (a Python list cannot be indexed by a list) and would
            # also fail when fewer than sample_size elements are available;
            # a slice handles both cases.
            return X_train[:sample_size]
        sampler = CoreSetMIPSampling(
            robustness_percentage=self.robustness_percentage,
            greedy=self.greedy)
        embeddings = np.array(
            orchestrator_api.infer(workspace_id, category_name,
                                   X_train)["embeddings"])
        res = sampler.query(X_train=np.array(X_train),
                            labeled_idx=labeled_idx,
                            amount=sample_size,
                            representation=embeddings)
        # keep only newly selected (previously unlabeled) indices, sorted
        selected_idx = np.sort(
            [idx for idx in res if idx not in set(labeled_idx)])
        res = np.array(X_train)[selected_idx[:sample_size]]
        return res.tolist()
Code example #2
0
    def get_recommended_items_for_labeling(self,
                                           workspace_id,
                                           model_id,
                                           dataset_name,
                                           category_name,
                                           sample_size=1):
        """
        Recommend *sample_size* elements for labeling by iteratively applying
        the discriminative sampling routine over the embedding space.
        """
        data_access = get_data_access()
        unlabeled = self.get_unlabeled_data(workspace_id, dataset_name,
                                            category_name,
                                            self.max_to_consider)
        labeled = data_access.sample_labeled_text_elements(
            workspace_id, dataset_name, category_name,
            self.max_to_consider)["results"]

        num_labeled = len(labeled)
        candidates = labeled + unlabeled
        labeled_idx = np.arange(num_labeled)
        unlabeled_idx = num_labeled + np.arange(len(candidates) - num_labeled)
        embeddings = np.array(
            orchestrator_api.infer(workspace_id, category_name,
                                   candidates)["embeddings"])

        # each discriminative round contributes an equal share of the batch
        sub_sample_size = int(sample_size / self.sub_batches)
        chosen_idx = self.get_selected_indices(
            embeddings, labeled_idx, unlabeled_idx, sample_size,
            sub_sample_size)

        return np.array(candidates)[chosen_idx].tolist()
Code example #3
0
def load_dataset(dataset_name: str,
                 force_new=False,
                 processor_factory=data_processor_factory
                 ) -> 'Optional[List[Document]]':
    """
    Load the Documents of the given dataset into the data-access layer.

    The return annotation is quoted (never evaluated at runtime) because the
    skip branch returns None, which the original `List[Document]` annotation
    did not reflect.

    :param dataset_name: name of the dataset to load
    :param force_new: if True, reload even when a documents dump already
        exists. Default is False.
    :param processor_factory: a factory for DataProcessorAPI. Default is
        data_processor_factory.
    :return: the list of loaded Documents, or None when loading is skipped
        because the documents dump directory already exists
    """

    docs_dir = utils.get_documents_dump_dir(dataset_name)
    if os.path.isdir(docs_dir) and not force_new:
        logging.info(
            f'{dataset_name}:\t\tskipping loading documents as {docs_dir} exists. '
            f'You can force a new loading by passing the parameter force_new=True'
        )
        return None
    data_access = data_access_factory.get_data_access()
    data_processor: DataProcessorAPI = processor_factory.get_data_processor(
        dataset_name)
    docs_from_preprocessor = data_processor.build_documents()
    data_access.add_documents(dataset_name=dataset_name,
                              documents=docs_from_preprocessor)
    num_of_text_elements = sum(
        len(doc.text_elements) for doc in docs_from_preprocessor)
    logging.info(
        f'{dataset_name}:\t\tloaded {len(docs_from_preprocessor)} documents '
        f'({num_of_text_elements} text elements) under {docs_dir}')
    return docs_from_preprocessor
Code example #4
0
 def get_per_element_score(self, items, workspace_id, model_id, dataset_name, category_name):
     """Score each element of *items* by comparing its embedding against the
     embeddings of the positively and negatively labeled training elements."""
     from lrtc_lib.orchestrator import orchestrator_api
     data_access = get_data_access()
     labeled = data_access.sample_labeled_text_elements(workspace_id, dataset_name, category_name,
                                                        self.max_to_consider)["results"]
     train_embeddings = np.array(orchestrator_api.infer(workspace_id, category_name, labeled)["embeddings"])
     unlabeled_embeddings = np.array(orchestrator_api.infer(workspace_id, category_name, items)["embeddings"])
     # split the training embeddings into positive / negative groups
     pos_idx = get_pos_idx(category_name, labeled)
     neg_idx = [i for i in range(len(train_embeddings)) if i not in pos_idx]
     pos = train_embeddings[pos_idx]
     neg = train_embeddings[neg_idx]
     return self.get_scores_from_embeddings(pos, neg, unlabeled_embeddings)
Code example #5
0
 def get_unlabeled_data(self, workspace_id: str, dataset_name: str, category_name: str, max_to_consider: int) \
         -> Sequence[TextElement]:
     """
     Fetch up to *max_to_consider* deduplicated elements that are unlabeled for the given dataset and category.
     :param workspace_id:
     :param dataset_name:
     :param category_name:
     :param max_to_consider:
     """
     from lrtc_lib.data_access.data_access_factory import get_data_access
     sample = get_data_access().sample_unlabeled_text_elements(workspace_id, dataset_name, category_name,
                                                               max_to_consider, remove_duplicates=True)
     unlabeled = sample["results"]
     logging.info(f"Got {len(unlabeled)} unlabeled elements for active learning")
     return unlabeled
    def __init__(self, first_model_positives_num: int,
                 first_model_negatives_num: int,
                 active_learning_suggestions_num: int):
        """
        Init the ExperimentsRunner
        :param first_model_positives_num: the number of positives instances to provide for the first model.
        :param first_model_negatives_num: the number of negative instances to provide for the first model.
        :param active_learning_suggestions_num: the number of instances to be suggested by the active learning strategy
        for the training of the second model.

        """
        # experiment sizing parameters
        self.first_model_positives_num = first_model_positives_num
        self.first_model_negatives_num = first_model_negatives_num
        self.active_learning_suggestions_num = active_learning_suggestions_num
        # shared data-access service and run state
        self.data_access: DataAccessApi = data_access_factory.get_data_access()
        self.cached_first_model_scores = False
        # configure the orchestrator to train on all labeled data
        # (NOTE(review): applied globally as a module-level setting — confirm
        # this is intended to affect other runners too)
        orchestrator_api.set_training_set_selection_strategy(
            TrainingSetSelectionStrategy.ALL_LABELED)
def compute_batch_scores(config, elements):
    """Compute (diversity, representativeness) scores for a batch of elements
    relative to the pool of unlabeled data of the configured category."""
    pool = get_data_access().sample_unlabeled_text_elements(
        config.workspace_id, config.train_dataset_name, config.category_name,
        10**6)["results"]

    def embed(texts):
        # embeddings are produced by the orchestrator's inference API
        return np.array(
            orchestrator_api.infer(config.workspace_id, config.category_name,
                                   texts)["embeddings"])

    pool_emb = embed(pool)
    batch_emb = embed(elements)

    outlier_value = KnnOutlierCalculator(pool_emb).compute_batch_score(batch_emb)
    # representativeness is the inverse of the kNN outlier score
    representativeness_value = 1 / outlier_value
    diversity_value = DiversityCalculator(pool_emb).compute_batch_score(batch_emb)
    return diversity_value, representativeness_value
# LICENSE: Apache License 2.0 (Apache-2.0)
# http://www.apache.org/licenses/LICENSE-2.0

import random
import unittest
from typing import List

import lrtc_lib.data_access.data_access_factory as data_access_factory
import lrtc_lib.data_access.single_dataset_loader as ds_loader
from lrtc_lib.data_access.core.data_structs import Document, TextElement, Label
from lrtc_lib.data_access.core.utils import URI_SEP
from lrtc_lib.data_access.data_access_in_memory import DataAccessInMemory
from lrtc_lib.orchestrator.orchestrator_api import LABEL_POSITIVE, LABEL_NEGATIVE

# Module-level in-memory data-access instance shared by the tests below.
data_access: DataAccessInMemory = data_access_factory.get_data_access()


def generate_simple_doc(dataset_name, doc_id=0, add_duplicate=False):
    sentences = [
        'Document Title is Super Interesting',
        'First sentence is not that attractive.',
        'The second one is a bit better.',
        'Last sentence offers a promising view for the future!'
    ]
    if add_duplicate:
        sentences.append('Document Title is Super Interesting')
    text_elements = []
    start_span = 0
    for idx, sentence in enumerate(sentences):
        end_span = start_span + len(sentence)
Code example #9
0
# Keys used when reporting per-split label counts.
TRAIN_COUNTS_STR_KEY = "train_counts"
DEV_COUNTS_STR_KEY = "dev_counts"

# Canonical string values for binary labels.
LABEL_POSITIVE = "true"
LABEL_NEGATIVE = "false"
BINARY_LABELS = frozenset({LABEL_NEGATIVE, LABEL_POSITIVE})

# members

# Module-level singletons configured from the project properties.
active_learning_strategy = PROJECT_PROPERTIES["active_learning_strategy"]
training_set_selection_strategy = PROJECT_PROPERTIES["training_set_selection"]
active_learner = PROJECT_PROPERTIES[
    "active_learning_factory"].get_active_learner(active_learning_strategy)

# shared data-access layer for the orchestrator
data_access = data_access_factory.get_data_access()

# selector that splits labeled data into train/dev sets per the configured strategy
train_and_dev_sets_selector = training_set_selector_factory.get_training_set_selector(
    selector=training_set_selection_strategy)


def _delete_orphan_labels():
    """
    delete labels that are not attached to a known workspace
    """
    all_label_dump_files = glob.glob(
        get_workspace_labels_dump_filename(workspace_id='*', dataset_name='*'))
    existing_workspace_ids = [
        w.workspace_id for w in orchestrator_state_api.get_all_workspaces()
    ]
    dump_files_with_parents = [