Example #1
    def _process(self):
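        # Read the raw CSV file, group rows by document id, and create one TextElement per row.
        # Each element receives a character span within its document and a per-category label
        # dict that marks the row's own category as positive and all other categories as negative.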
        if not os.path.isfile(self.get_raw_data_path()):
            raise Exception(
                f'{self.dataset_part.name.lower()} set file for dataset "{self.dataset_name}" not found'
            )
        all_categories = self._get_all_categories()
        df = pd.read_csv(self.get_raw_data_path(), encoding=self.encoding)

        texts_categories_contexts_doc_ids = list(
            zip(df[self.text_col], df[self.label_col]))

        texts_categories_contexts_doc_ids = \
            add_column_or_default_to_zip(texts_categories_contexts_doc_ids, df, self.context_col, None)

        texts_categories_contexts_doc_ids = \
            add_column_or_default_to_zip(texts_categories_contexts_doc_ids, df, self.doc_id_col, 0)

        uri_to_category_labels = []
        prev_doc_id = None
        element_id = -1
        text_span_start = 0
        doc_uri_to_text_elements = defaultdict(list)
        for idx, (text, category, context,
                  doc_id) in enumerate(texts_categories_contexts_doc_ids):
            if prev_doc_id is not None and prev_doc_id != doc_id:
                element_id = -1
                text_span_start = 0

            doc_uri = (self.dataset_name + '_' + self.dataset_part.name.lower()
                       + URI_SEP + str(doc_id))
            element_id += 1
            text_element_uri = doc_uri + URI_SEP + str(element_id)
            metadata = {METADATA_CONTEXT_KEY: context} if context else {}
            text_element = TextElement(uri=text_element_uri,
                                       text=text,
                                       span=[(text_span_start,
                                              (text_span_start + len(text)))],
                                       metadata=metadata,
                                       category_to_label={})
            doc_uri_to_text_elements[doc_uri].append(text_element)
            category_to_label_dict = {
                cat: Label(labels=self.LABEL_POSITIVE, metadata={}) if cat == category
                else Label(labels=self.LABEL_NEGATIVE, metadata={})
                for cat in all_categories}
            uri_to_category_labels.append(
                (text_element_uri, category_to_label_dict))
            prev_doc_id = doc_id
            text_span_start += (len(text) + 1)

        self.documents = [
            Document(uri=doc_uri, text_elements=text_elements, metadata={})
            for doc_uri, text_elements in doc_uri_to_text_elements.items()
        ]
        self.uri_category_labels = uri_to_category_labels
Example #2
def generate_random_uris_and_labels(dataset_name: str, num_texts_to_label: int,
                                    categories: List[str]):
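    # Build (uri, category -> Label) pairs with synthetic URIs; for each entry a random subset
    # of the given categories is labeled positive and the remaining categories negative.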
    sentences_and_labels = []
    for i in range(num_texts_to_label):
        categories_to_label = random.sample(categories,
                                            random.randint(0, len(categories)))
        labels = {
            cat: Label(labels=LABEL_POSITIVE, metadata={})
            if cat in categories_to_label else Label(labels=LABEL_NEGATIVE,
                                                     metadata={})
            for cat in categories
        }
        sentences_and_labels.append((dataset_name + URI_SEP + str(i), labels))
    return sentences_and_labels
Example #3
    def test_add_and_get_gold_labels_for_category_no_dump(self):
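        # Add gold labels that also carry an extra non-target category, then verify that
        # retrieving gold labels for the target category returns only the expected labels.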
        num_elements_to_labels = 10
        dataset_name = self.test_add_and_get_gold_labels_for_category_no_dump.__name__

        target_category = 'Autobots'
        uris_to_gold_labels_expected = generate_random_uris_and_labels(
            dataset_name, num_elements_to_labels, [target_category])
        non_target_category = 'Decepticons'
        uris_to_gold_labels = [
            (uri, dict(labels,
                       **{non_target_category: Label(labels=LABEL_POSITIVE, metadata={})}))
            for uri, labels in uris_to_gold_labels_expected
        ]

        oracle.add_gold_labels(dataset_name, uris_to_gold_labels)

        uris_to_retrieve = [
            uri for uri, labels_dict in uris_to_gold_labels_expected
        ]
        uri_to_gold_labels_found = oracle.get_gold_labels(
            dataset_name, uris_to_retrieve, target_category)

        self.assert_uri_labels_equal(uris_to_gold_labels_expected,
                                     uri_to_gold_labels_found)
        loader.clear_gold_labels_file(dataset_name)
Example #4
def get_labels(workspace_id, dataset_name):
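    # Return the labels dict for this workspace/dataset from an in-memory cache. On a cache
    # miss, load the JSON dump from disk if it exists; otherwise persist an empty dict so a
    # dump file is created for later calls.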
    global labels_in_memory
    if workspace_id not in labels_in_memory \
            or dataset_name not in labels_in_memory[workspace_id]:
        file_path = utils.get_workspace_labels_dump_filename(
            workspace_id, dataset_name)
        if os.path.isfile(file_path):
            # Read dict from disk
            with open(file_path) as f:
                labels_encoded = f.read()
            simplified_dict = json.loads(labels_encoded)
            labels_dict = defaultdict(lambda: defaultdict(dict))
            labels_dict.update({
                k: {
                    category: Label(**label_dict)
                    for category, label_dict in v.items()
                }
                for k, v in simplified_dict.items()
            })
            labels_in_memory[workspace_id][dataset_name] = labels_dict
        else:
            # Save empty dict to disk
            os.makedirs(Path(file_path).parent, exist_ok=True)
            empty_dict_encoded = json.dumps(
                labels_in_memory[workspace_id][dataset_name])
            with open(file_path, 'w') as f:
                f.write(empty_dict_encoded)
    return labels_in_memory[workspace_id][dataset_name]
Example #5
    def test_get_label_counts(self):
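        # Label a few elements for another category, force one element to carry a negative
        # label for the target category, and verify that get_label_counts matches counts
        # computed directly from the submitted labels.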
        workspace_id = 'test_get_label_counts'
        dataset_name = self.test_get_label_counts.__name__ + '_dump'
        category = 'Decepticons'
        docs = generate_corpus(dataset_name, 2)
        # add labels info for a single doc
        selected_doc = docs[0]
        texts_and_labels_list = generate_random_texts_and_labels(
            selected_doc, 5, ['Autobots'])
        if texts_and_labels_list:
            if category in texts_and_labels_list[0][1]:
                texts_and_labels_list[0][1][category].labels = frozenset(
                    LABEL_NEGATIVE)
            else:
                texts_and_labels_list[0][1][category] = Label(
                    labels=LABEL_NEGATIVE, metadata={})
        data_access.set_labels(workspace_id, texts_and_labels_list)

        category_label_counts = data_access.get_label_counts(
            workspace_id, dataset_name, category)
        for label_val, observed_count in category_label_counts.items():
            expected_count = len([
                t for t in texts_and_labels_list
                if category in t[1] and label_val in t[1][category].labels
            ])
            self.assertEqual(expected_count, observed_count,
                             f'count for {label_val} does not match.')
        ds_loader.clear_all_saved_files(dataset_name)
Example #6
    def set_first_model_negatives(self, config,
                                  random_seed) -> List[str]:
        """
        Randomly choose from all unlabeled instances.
        :param config: experiment config for this run
        :param random_seed: a seed for the Random being used for sampling
        :return: the URIs of the text elements that were set as negatives
        """
        sampled_unlabeled_text_elements = \
            self.data_access.sample_unlabeled_text_elements(workspace_id=config.workspace_id,
                                                            dataset_name=config.train_dataset_name,
                                                            category_name=config.category_name,
                                                            sample_size=self.first_model_negatives_num,
                                                            remove_duplicates=True)['results']
        negative_uris_and_label = [(x.uri, {
            config.category_name:
            Label(LABEL_NEGATIVE, {})
        }) for x in sampled_unlabeled_text_elements]
        orchestrator_api.set_labels(config.workspace_id,
                                    negative_uris_and_label)

        negative_uris = [x.uri for x in sampled_unlabeled_text_elements]
        logging.info(
            f'set the label of {len(negative_uris_and_label)} random unlabeled instances as negatives '
            f'for category {config.category_name}')
        return negative_uris
Example #7
def get_gold_labels(
        dataset_name: str,
        category_name: str = None) -> Mapping[str, Mapping[str, Label]]:
    """
    :param dataset_name: the name of the dataset from which the gold labels should be retrieved
    :param category_name: the name of the category for which label information is needed. Default is None, meaning all
    categories.
    :return: a mapping of URI -> category name -> Label
    """
    global gold_labels_per_dataset

    if gold_labels_per_dataset is None \
            or gold_labels_per_dataset[0] != dataset_name:  # not in memory
        if os.path.exists(get_labels_dump_filename(dataset_name)):  # try to load from disk
            with open(get_labels_dump_filename(dataset_name)) as json_file:
                text_and_gold_labels_encoded = json_file.read()
            simplified_dict = json.loads(text_and_gold_labels_encoded)
            labels_dict = {
                k: {
                    category: Label(**label_dict)
                    for category, label_dict in v.items()
                }
                for k, v in simplified_dict.items()
            }
            gold_labels_per_dataset = (dataset_name, labels_dict)
        else:  # or create an empty in-memory
            gold_labels_per_dataset = (dataset_name, nested_default_dict())

    uri_categories_and_labels_map = gold_labels_per_dataset[1]
    if category_name is not None:
        data_view_func = PROJECT_PROPERTIES["data_view_func"]
        uri_categories_and_labels_map = data_view_func(
            category_name, uri_categories_and_labels_map)
    return uri_categories_and_labels_map
Example #8
def generate_random_texts_and_labels(doc: Document,
                                     num_sentences_to_label: int,
                                     categories: List[str]):
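    # Sample up to num_sentences_to_label elements from the document; for each sampled element
    # a random subset of the given categories is labeled positive and the rest negative.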
    sentences_and_labels = []
    text_elements_to_label = random.sample(
        doc.text_elements, min(num_sentences_to_label, len(doc.text_elements)))
    for elem in text_elements_to_label:
        categories_to_label = random.sample(categories,
                                            random.randint(0, len(categories)))
        labels = {
            cat: Label(labels=LABEL_POSITIVE, metadata={})
            if cat in categories_to_label else Label(labels=LABEL_NEGATIVE,
                                                     metadata={})
            for cat in categories
        }
        sentences_and_labels.append((elem.uri, labels))
    return sentences_and_labels
Example #9
    def test_sample_by_query_text_elements(self):
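        # Label two elements, only one of which matches the query, then check that unlabeled
        # sampling with the query returns only unlabeled elements and that labeled sampling
        # returns only the single labeled element containing the query.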
        workspace_id = 'test_sample_by_query_text_elements'
        dataset_name = self.test_sample_by_query_text_elements.__name__ + '_dump'
        category = 'Autobots'
        query = 'sentence'
        sample_all = 10**100  # a huge sample_size to sample all elements
        doc = generate_corpus(dataset_name, 1)[0]
        # doc's elements = ['Document Title is Super Interesting', 'First sentence is not that attractive.',
        #          'The second one is a bit better.', 'Last sentence offers a promising view for the future!']
        # add labels info for a single doc
        texts_and_labels_list = [
            # 1st sent does not match query
            (doc.text_elements[0].uri, {
                category: Label(labels=LABEL_POSITIVE, metadata={})
            }),
            # 2nd sent does match query
            (doc.text_elements[1].uri, {
                category: Label(labels=LABEL_POSITIVE, metadata={})
            })
        ]
        data_access.set_labels(workspace_id, texts_and_labels_list)

        # query + unlabeled elements
        sampled_texts_res = data_access.sample_unlabeled_text_elements(
            workspace_id, dataset_name, category, sample_all, query)
        for sampled_text in sampled_texts_res['results']:
            self.assertDictEqual(sampled_text.category_to_label, {})

        # query + labeled elements
        sampled_texts_res = data_access.sample_labeled_text_elements(
            workspace_id, dataset_name, category, sample_all, query)
        self.assertEqual(
            1, len(sampled_texts_res['results']),
            'only the single labeled element that matches the query should have been sampled.'
        )
        texts_and_labels_dict = dict(texts_and_labels_list)
        for sampled_text in sampled_texts_res['results']:
            self.assertIn(
                sampled_text.uri, texts_and_labels_dict.keys(),
                f'the sampled text uri - {sampled_text.uri} - was not found in the '
                f'texts that were labeled: {texts_and_labels_dict}')
            self.assertIn(query, sampled_text.text)
        ds_loader.clear_all_saved_files(dataset_name)
Example #10
    def _process(self):
        # Parse a whitespace-separated label/text file (one example per line): the token before
        # the first space holds the label, with the coarse-grained category before the ':'.
        # Each line becomes one TextElement, and all elements are stored in a single Document.
        raw_data_file_path = self.get_raw_data_path()
        all_categories = self._get_all_categories()
        text_elements = []
        uri_to_category_labels = []
        with open(raw_data_file_path, 'r', encoding='latin-1') as f:
            labels_text_split = [line.rstrip().split(' ', 1) for line in f.readlines()]
        texts = [elem[1].split(self.sep_for_idx)[0] for elem in labels_text_split]
        categories_tuple = [(elem[0].split(':')[0], elem[0])
                            for elem in labels_text_split]
        # [(text, (coarse-grained, fine-grained))]
        texts_and_labels = list(zip(texts, categories_tuple))
        for text_element_id, (text, categories) in enumerate(texts_and_labels):
            uri = self.doc_uri + URI_SEP + str(text_element_id)
            text_elements.append(
                TextElement(uri=uri,
                            text=text,
                            span=[(0, len(text))],
                            metadata={},
                            category_to_label={}))
            category = categories[1] if self.use_fine_grained_labels else categories[0]
            category_to_label_dict = {
                cat: Label(labels=self.LABEL_POSITIVE, metadata={})
                if cat == category else Label(labels=self.LABEL_NEGATIVE, metadata={})
                for cat in all_categories
            }
            uri_to_category_labels.append((uri, category_to_label_dict))
        self.documents = [
            Document(uri=self.doc_uri, text_elements=text_elements, metadata={})
        ]
        self.uri_category_labels = uri_to_category_labels
Example #11
    def test_copy_existing_workspace_with_labeled_data(self):
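        # Copy a workspace that already contains labeled data, verify that queries over the copy
        # return the same results as the original, then relabel in the copy and verify that the
        # two workspaces now return different results.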
        try:
            workspace_id = "wd_id"
            new_workspace_id = "new_" + workspace_id
            orchestrator_api.delete_workspace(workspace_id, ignore_errors=True)
            orchestrator_api.delete_workspace(new_workspace_id, ignore_errors=True)
            dataset_name = "ds_name"
            cat_name = "cat_name"
            cat_desc = "cat_desc"
            document = generate_simple_doc(dataset_name)
            orchestrator_api.add_documents(dataset_name, [document])
            orchestrator_api.create_workspace(workspace_id, dataset_name)
            orchestrator_api.create_new_category(workspace_id, cat_name, cat_desc)
            # List[(str,mapping(str,Label))]
            uri1 = document.text_elements[0].uri
            uri2 = document.text_elements[1].uri
            labels = [(uri1, {cat_name: Label(LABEL_POSITIVE, {})}), (uri2, {cat_name: Label(LABEL_NEGATIVE, {})})]
            orchestrator_api.set_labels(workspace_id, labels)

            orchestrator_api.copy_workspace(workspace_id, new_workspace_id)
            results_original = orchestrator_api.query(workspace_id=workspace_id, dataset_name=dataset_name,
                                                      category_name=cat_name,
                                                      query="with label", unlabeled_only=False, sample_size=10)
            results_new = orchestrator_api.query(workspace_id=new_workspace_id, dataset_name=dataset_name,
                                                 category_name=cat_name,
                                                 query="with label", unlabeled_only=False, sample_size=10)

            self.assertEqual(results_original["results"], results_new["results"])

            labels = [(uri1, {cat_name: Label(LABEL_NEGATIVE, {})}), (uri2, {cat_name: Label(LABEL_POSITIVE, {})})]
            orchestrator_api.set_labels(new_workspace_id, labels)
            results_new = orchestrator_api.query(workspace_id=new_workspace_id, dataset_name=dataset_name,
                                                 category_name=cat_name,
                                                 query="with label", unlabeled_only=False, sample_size=10)
            self.assertNotEqual(results_original["results"], results_new["results"])
        finally:
            orchestrator_api.delete_workspace(workspace_id, ignore_errors=True)
            orchestrator_api.delete_workspace(new_workspace_id, ignore_errors=True)
            single_dataset_loader.clear_all_saved_files(dataset_name)
Example #12
    def get_train_and_dev_sets(self,
                               workspace_id,
                               train_dataset_name,
                               category_name,
                               dev_dataset_name=None) -> Tuple[Tuple, Tuple]:
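        # Gather labeled train and dev data; if negative_ratio is set, top up the training
        # negatives with unlabeled elements (weakly labeled as negative) until the requested
        # ratio of negatives to positives is reached, otherwise use all unlabeled elements.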

        train_data, train_counts = self.get_data_and_counts_for_labeled(
            workspace_id,
            train_dataset_name,
            category_name,
            remove_duplicates=True)
        dev_data, dev_counts = self.get_data_and_counts_for_labeled(
            workspace_id,
            dev_dataset_name,
            category_name,
            remove_duplicates=True)

        required_number_of_unlabeled_as_neg = MAX_VALUE
        if self.negative_ratio is not None:
            # subtract the number of elements already labeled as negative from the requested number of negatives
            required_number_of_unlabeled_as_neg = \
                max(0, self.negative_ratio * train_counts[self.pos_label] - train_counts.get(self.neg_label, 0))
            if required_number_of_unlabeled_as_neg > 0:
                logging.info(
                    f"Trying to add {required_number_of_unlabeled_as_neg} to meet ratio of "
                    f"{self.negative_ratio} negatives per positive")
        else:
            logging.info(f"using all unlabeled elements as negatives")

        if required_number_of_unlabeled_as_neg > 0:
            unlabeled_sample = \
                data_access.sample_unlabeled_text_elements(workspace_id=workspace_id, dataset_name=train_dataset_name,
                                                           category_name=category_name,
                                                           sample_size=required_number_of_unlabeled_as_neg,
                                                           remove_duplicates=True)
            for element in unlabeled_sample['results']:
                element.category_to_label = {
                    category_name: Label(self.neg_label, {})
                }
                train_data.append(element)
            train_counts["weak_" + self.neg_label] = len(
                unlabeled_sample['results'])

        logging.info(
            f"using {len(train_data)} for train using dataset {train_dataset_name}"
            + (f" and {len(dev_data)} for dev using dataset {dev_dataset_name}"
               if dev_data is not None else " with no dev dataset"))

        return (train_data, train_counts), (dev_data, dev_counts)
Example #13
    def test_duplicates_removal(self):
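        # Build a corpus that contains a duplicate element, check that sampling with
        # remove_duplicates drops it, and verify that setting labels with and without
        # propagate_to_duplicates is reflected in the label counts.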
        workspace_id = 'test_duplicates_removal'
        dataset_name = self.test_duplicates_removal.__name__ + '_dump'
        generate_corpus(dataset_name, 1, add_duplicate=True)
        all_elements = data_access.get_all_text_elements(dataset_name)
        all_elements2 = data_access.sample_text_elements(
            dataset_name, 10**6, remove_duplicates=False)['results']
        self.assertListEqual(all_elements, all_elements2)
        all_without_dups = data_access.sample_text_elements(
            dataset_name, 10**6, remove_duplicates=True)['results']
        self.assertEqual(len(all_elements), len(all_without_dups) + 1)

        category = 'cat1'
        texts_and_labels_list = [
            (elem.uri, {category: Label(labels=LABEL_POSITIVE, metadata={})})
            for elem in all_without_dups
        ]
        # set labels without propagating to duplicates
        data_access.set_labels(workspace_id,
                               texts_and_labels_list,
                               propagate_to_duplicates=False)
        labels_count = data_access.get_label_counts(workspace_id, dataset_name,
                                                    category)
        self.assertEqual(labels_count[LABEL_POSITIVE], len(all_without_dups))
        # unset labels
        data_access.unset_labels(workspace_id, category,
                                 [elem.uri for elem in all_without_dups])
        labels_count = data_access.get_label_counts(workspace_id, dataset_name,
                                                    category)
        self.assertEqual(labels_count[LABEL_POSITIVE], 0)
        # set labels with propagating to duplicates
        data_access.set_labels(workspace_id,
                               texts_and_labels_list,
                               propagate_to_duplicates=True)
        labels_count = data_access.get_label_counts(workspace_id, dataset_name,
                                                    category)
        self.assertEqual(labels_count[LABEL_POSITIVE], len(all_elements))
        ds_loader.clear_all_saved_files(dataset_name)
Example #14
def multi_category_to_single_category_multi_label(
        new_category_name: str,
        uri_categories_and_labels_map: dict,
        cat_subset=None):
    """
    performs the following transformation:
    given:
    cat1: true
    cat2: false
    cat3: false
    returns:
    new_cat_name: cat1
    """
    from lrtc_lib.orchestrator.orchestrator_api import LABEL_POSITIVE
    filtered_gold_labels = {}
    for uri, label_dict in uri_categories_and_labels_map.items():
        labels = frozenset(
            cat for cat in label_dict
            if LABEL_POSITIVE in label_dict[cat].labels)
        filtered_gold_labels[uri] = {
            new_category_name: Label(labels=labels, metadata={})
        }
    return filtered_gold_labels
Example #15
def add_labels_to_doc(doc: Document, category: str):
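    # Label every text element of the document as a positive example of the given category.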
    sentences_and_labels = []
    for elem in doc.text_elements:
        labels = {category: Label(labels=LABEL_POSITIVE, metadata={})}
        sentences_and_labels.append((elem.uri, labels))
    return sentences_and_labels