Example no. 1
    def set_first_model_negatives(self, config: ExperimentParams,
                                  random_seed) -> List[str]:
        """
        Randomly choose from all unlabeled instances.
        :param config: experiment config for this run
        :param random_seed: a seed for the Random being used for sampling (unused here; kept for a uniform interface)
        :return: a list of the URIs of the elements labeled as negatives
        """
        sampled_unlabeled_text_elements = \
            self.data_access.sample_unlabeled_text_elements(workspace_id=config.workspace_id,
                                                            dataset_name=config.train_dataset_name,
                                                            category_name=config.category_name,
                                                            sample_size=self.first_model_negatives_num,
                                                            remove_duplicates=True)['results']
        negative_uris_and_label = [
            (x.uri, {config.category_name: Label(LABEL_NEGATIVE, {})})
            for x in sampled_unlabeled_text_elements]
        orchestrator_api.set_labels(config.workspace_id,
                                    negative_uris_and_label)

        negative_uris = [x.uri for x in sampled_unlabeled_text_elements]
        logging.info(
            f'set the label of {len(negative_uris_and_label)} random unlabeled instances as negatives '
            f'for category {config.category_name}')
        return negative_uris
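
For reference, a minimal sketch of the (uri, labels) pair format this method builds and passes to orchestrator_api.set_labels; the workspace id, URI, and category name are made up for illustration, and the import paths are assumptions:

import lrtc_lib.orchestrator.orchestrator_api as orchestrator_api  # assumed import path
from lrtc_lib.data_access.core.data_structs import Label, LABEL_NEGATIVE  # assumed import path

# Each entry maps a text element URI to a per-category Label object.
negative_uris_and_label = [
    ('my_dataset-doc0-0', {'my_category': Label(LABEL_NEGATIVE, {})}),  # hypothetical URI and category
]
orchestrator_api.set_labels('my_workspace', negative_uris_and_label)
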
    def run_active_learning_iteration(self, config: ExperimentParams, al,
                                      iteration):
        # get suggested elements for labeling (and their gold labels)
        suggested_text_elements, suggested_uris_and_gold_labels = \
            self.get_suggested_elements_and_gold_labels(config, al)

        # calculate metrics for the batch suggested by the active learning strategy
        al_batch_dict = self.generate_al_batch_dict(config,
                                                    suggested_text_elements)

        # set gold labels as the user-provided labels of the elements suggested by the active learning strategy
        orchestrator_api.set_labels(config.workspace_id,
                                    suggested_uris_and_gold_labels)

        # train a new model with the additional elements suggested by the active learning strategy
        new_model_id = orchestrator_api.train(config.workspace_id,
                                              config.category_name,
                                              config.model,
                                              train_params=config.train_params)
        if new_model_id is None:
            raise Exception('New model was not trained')

        # evaluate the new model
        eval_dataset = config.test_dataset_name
        res_dict = self.evaluate(config, al.name, iteration, eval_dataset,
                                 suggested_text_elements)
        res_dict.update(al_batch_dict)

        logging.info(
            f'Evaluation on dataset: {eval_dataset}, with AL: {al.name}, iteration: {iteration}, '
            f'repeat: {config.repeat_id}, model (id: {new_model_id}) is: {res_dict}\t'
            f'workspace: {config.workspace_id}')
        return res_dict, new_model_id
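
Taken together with train_first_model from the next example, a typical driver for these methods might look like the following sketch; the class name, constructor arguments, strategy list, and iteration count are all illustrative assumptions, not taken from the source:

# Hypothetical experiment driver (placeholder names throughout):
runner = ExperimentRunner(first_model_positives_num=100,
                          first_model_negatives_num=100)  # assumed constructor
runner.train_first_model(config)                 # seed labels and train the first model
for al in active_learning_strategies:            # e.g. uncertainty-based strategies
    for iteration in range(1, num_iterations + 1):
        res_dict, model_id = runner.run_active_learning_iteration(
            config, al, iteration)
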
Example no. 3
    def set_first_model_positives(self, config: ExperimentParams,
                                  random_seed) -> List[str]:
        """
        Randomly choose true positive instances.
        :param config: experiment config for this run
        :param random_seed: a seed for the Random being used for sampling
        :return: a list of the URIs of the selected positive instances
        """
        all_positives = oracle_data_access_api.sample_positives(
            config.train_dataset_name, config.category_name, 10**6,
            random_seed)
        all_without_duplicates = self.data_access.sample_text_elements(
            config.train_dataset_name, 10**6,
            remove_duplicates=True)['results']
        # use a set for O(1) membership tests when filtering the positives
        uris_without_dups = {element.uri for element in all_without_duplicates}
        pos_without_dups = [(uri, label) for uri, label in all_positives
                            if uri in uris_without_dups]
        # slicing already clamps to the list length, so no explicit min() is needed
        selected_positives = pos_without_dups[:self.first_model_positives_num]
        orchestrator_api.set_labels(config.workspace_id, selected_positives)

        positive_uris = [uri for uri, label in selected_positives]
        logging.info(
            f'set the label of {len(selected_positives)} true positive instances as positives '
            f'for category {config.category_name}')
        return positive_uris

    def train_first_model(self, config: ExperimentParams):
        if orchestrator_api.workspace_exists(config.workspace_id):
            orchestrator_api.delete_workspace(config.workspace_id)

        orchestrator_api.create_workspace(
            config.workspace_id,
            config.train_dataset_name,
            dev_dataset_name=config.dev_dataset_name)
        orchestrator_api.create_new_category(config.workspace_id,
                                             config.category_name,
                                             "No description for you")

        dev_text_elements_uris = orchestrator_api.get_all_text_elements_uris(
            config.dev_dataset_name)
        dev_text_elements_and_labels = oracle_data_access_api.get_gold_labels(
            config.dev_dataset_name, dev_text_elements_uris)
        if dev_text_elements_and_labels is not None:
            orchestrator_api.set_labels(config.workspace_id,
                                        dev_text_elements_and_labels)

        random_seed = sum(ord(c) for c in config.workspace_id)
        logging.info(str(config))
        logging.info(f'random seed: {random_seed}')

        self.set_first_model_positives(config, random_seed)
        self.set_first_model_negatives(config, random_seed)

        # train first model
        logging.info(
            f'Starting first model training (model: {config.model.name})\tworkspace: {config.workspace_id}'
        )
        new_model_id = orchestrator_api.train(config.workspace_id,
                                              config.category_name,
                                              config.model,
                                              train_params=config.train_params)
        if new_model_id is None:
            raise Exception(
                f'a new model was not trained\tworkspace: {config.workspace_id}'
            )

        eval_dataset = config.test_dataset_name
        res_dict = self.evaluate(config,
                                 al=self.NO_AL,
                                 iteration=0,
                                 eval_dataset=eval_dataset)
        res_dict.update(self.generate_al_batch_dict(
            config))  # ensures AL-related keys are in the results dictionary

        logging.info(
            f'Evaluation on dataset: {eval_dataset}, iteration: 0, repeat: {config.repeat_id}, '
            f'first model (id: {new_model_id}) is: {res_dict}\t'
            f'workspace: {config.workspace_id}')

        return res_dict
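
The per-run seed in train_first_model is derived deterministically from the workspace id, so repeated runs over the same workspace sample the same first-model instances; a quick worked illustration (the workspace id is hypothetical):

workspace_id = 'exp-my_dataset-cat1-repeat1'     # hypothetical workspace id
random_seed = sum(ord(c) for c in workspace_id)  # same id -> same seed on every run
print(random_seed)
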
    def set_first_model_positives(self, config: ExperimentParams,
                                  random_seed) -> List[str]:
        """
        Choose instances by queries, regardless of their gold label.

        :param config: experiment config for this run
        :param random_seed: a seed for the Random being used for sampling
        :return: a list of the URIs of the sampled text elements
        """
        general_dataset_name = config.train_dataset_name.split('_train')[0]
        queries = self.queries_per_dataset[general_dataset_name][
            config.category_name]
        sampled_unlabeled_text_elements = []
        for query in queries:
            sampled_unlabeled_text_elements.extend(
                self.data_access.sample_unlabeled_text_elements(
                    workspace_id=config.workspace_id,
                    dataset_name=config.train_dataset_name,
                    category_name=config.category_name,
                    sample_size=self.first_model_positives_num,
                    query=query,
                    remove_duplicates=True)['results'])
            logging.info(
                f"Positive sampling: after query '{query}' the sampled pool size is "
                f"{len(sampled_unlabeled_text_elements)}")

        if len(sampled_unlabeled_text_elements) > self.first_model_positives_num:
            random.seed(random_seed)
            sampled_unlabeled_text_elements = random.sample(
                sampled_unlabeled_text_elements,
                self.first_model_positives_num)

        sampled_uris = [t.uri for t in sampled_unlabeled_text_elements]
        sampled_uris_and_gold_labels = dict(
            oracle_data_access_api.get_gold_labels(config.train_dataset_name,
                                                   sampled_uris))
        sampled_uris_and_label = \
            [(x.uri, {config.category_name: sampled_uris_and_gold_labels[x.uri][config.category_name]})
             for x in sampled_unlabeled_text_elements]
        orchestrator_api.set_labels(config.workspace_id,
                                    sampled_uris_and_label)

        logging.info(
            f'Set the label of {len(sampled_uris_and_label)} instances sampled by queries {queries} '
            f'using the oracle for category {config.category_name}')
        logging.info(
            f"Positive sampling returned {len(sampled_uris)} elements")

        return sampled_uris
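
The query-based variant above assumes a queries_per_dataset mapping from a general dataset name to per-category lists of query strings; a minimal sketch of that structure (dataset, category, and query text are made up):

# Hypothetical contents of self.queries_per_dataset; the '_train' suffix is
# stripped from config.train_dataset_name before the lookup.
queries_per_dataset = {
    'my_dataset': {
        'my_category': ['keyword1', 'keyword2'],  # one sampling call per query
    },
}
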
    def set_first_model_positives(self, config: ExperimentParams, random_seed) -> List[str]:
        """
        Randomly choose instances, regardless of their gold label.
        :param config: experiment config for this run
        :param random_seed: a seed for the Random being used for sampling
        :return: a list of the URIs of the sampled instances
        """
        sample_size = self.first_model_positives_num
        sampled_elements = self.data_access.sample_text_elements(config.train_dataset_name, sample_size,
                                                                 remove_duplicates=True)['results']
        sampled_uris = [element.uri for element in sampled_elements]
        sampled_uris_with_labels = \
            oracle_data_access_api.get_gold_labels(config.train_dataset_name, sampled_uris, config.category_name)
        orchestrator_api.set_labels(config.workspace_id, sampled_uris_with_labels)

        logging.info(f'set the label of {len(sampled_uris_with_labels)} random instances with their gold label '
                     f'(can be positive or negative) for category {config.category_name}')
        return sampled_uris
Example no. 7
    def test_copy_existing_workspace_with_labeled_data(self):
        try:
            workspace_id = "wd_id"
            new_workspace_id = "new_" + workspace_id
            orchestrator_api.delete_workspace(workspace_id, ignore_errors=True)
            orchestrator_api.delete_workspace(new_workspace_id, ignore_errors=True)
            dataset_name = "ds_name"
            cat_name = "cat_name"
            cat_desc = "cat_desc"
            document = generate_simple_doc(dataset_name)
            orchestrator_api.add_documents(dataset_name, [document])
            orchestrator_api.create_workspace(workspace_id, dataset_name)
            orchestrator_api.create_new_category(workspace_id, cat_name, cat_desc)
            # labels format: List[Tuple[str, Mapping[str, Label]]]
            uri1 = document.text_elements[0].uri
            uri2 = document.text_elements[1].uri
            labels = [(uri1, {cat_name: Label(LABEL_POSITIVE, {})}), (uri2, {cat_name: Label(LABEL_NEGATIVE, {})})]
            orchestrator_api.set_labels(workspace_id, labels)

            orchestrator_api.copy_workspace(workspace_id, new_workspace_id)
            results_original = orchestrator_api.query(workspace_id=workspace_id, dataset_name=dataset_name,
                                                      category_name=cat_name,
                                                      query="with label", unlabeled_only=False, sample_size=10)
            results_new = orchestrator_api.query(workspace_id=new_workspace_id, dataset_name=dataset_name,
                                                 category_name=cat_name,
                                                 query="with label", unlabeled_only=False, sample_size=10)

            self.assertEqual(results_original["results"], results_new["results"])

            labels = [(uri1, {cat_name: Label(LABEL_NEGATIVE, {})}), (uri2, {cat_name: Label(LABEL_POSITIVE, {})})]
            orchestrator_api.set_labels(new_workspace_id, labels)
            results_new = orchestrator_api.query(workspace_id=new_workspace_id, dataset_name=dataset_name,
                                                 category_name=cat_name,
                                                 query="with label", unlabeled_only=False, sample_size=10)
            self.assertNotEqual(results_original["results"], results_new["results"])
        finally:
            orchestrator_api.delete_workspace(workspace_id, ignore_errors=True)
            orchestrator_api.delete_workspace(new_workspace_id, ignore_errors=True)
            single_dataset_loader.clear_all_saved_files(dataset_name)
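
The test above relies on a generate_simple_doc helper; a minimal sketch of what such a helper could look like, assuming the framework's Document and TextElement data structures (the import path and all field names are assumptions, not verified against the source):

from lrtc_lib.data_access.core.data_structs import Document, TextElement  # assumed import path

def generate_simple_doc(dataset_name):
    # Hypothetical helper: builds one document with two text elements so the
    # test has two URIs to label; both texts contain the queried phrase
    # "with label" so the test's query returns them.
    texts = ['first sentence with label.', 'second sentence with label.']
    doc_uri = dataset_name + '-doc0'
    elements = [TextElement(uri=f'{doc_uri}-{i}', text=text, span=[(0, len(text))],
                            metadata={}, category_to_label={})
                for i, text in enumerate(texts)]
    return Document(uri=doc_uri, text_elements=elements, metadata={})
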