from copy import deepcopy

import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import check_random_state

# INTENTS, UTTERANCES, ENTITIES and DATA are dataset-key constants, and
# NotEnoughDataError, not_enough_data, get_utterances_subset and
# update_entities_with_utterances are helpers defined elsewhere in the
# same package


def create_shuffle_stratified_splits(dataset,
                                     n_splits,
                                     train_size_ratio=1.0,
                                     drop_entities=False,
                                     seed=None):
    if train_size_ratio > 1.0 or train_size_ratio < 0:
        raise ValueError("Invalid value for train size ratio: %s" %
                         train_size_ratio)

    nb_utterances = {
        intent: len(data[UTTERANCES])
        for intent, data in dataset[INTENTS].items()
    }
    total_utterances = sum(nb_utterances.values())
    if total_utterances < n_splits:
        raise NotEnoughDataError("Number of utterances is too low (%s)" %
                                 total_utterances)
    if drop_entities:
        dataset = deepcopy(dataset)
        for entity, data in dataset[ENTITIES].items():
            data[DATA] = []
    else:
        dataset = update_entities_with_utterances(dataset)

    utterances = np.array([
        (intent_name, utterance)
        for intent_name, intent_data in dataset[INTENTS].items()
        for utterance in intent_data[UTTERANCES]
    ])
    intents = np.array([u[0] for u in utterances])
    # StratifiedKFold only uses the labels for stratification, so a dummy
    # all-zeros feature matrix is enough here
    X = np.zeros(len(intents))
    random_state = check_random_state(seed)
    sss = StratifiedKFold(n_splits=n_splits,
                          shuffle=True,
                          random_state=random_state)
    splits = []
    try:
        for train_index, test_index in sss.split(X, intents):
            train_utterances = utterances[train_index].tolist()
            train_utterances = get_utterances_subset(train_utterances,
                                                     train_size_ratio)
            test_utterances = utterances[test_index].tolist()

            if len(train_utterances) == 0:
                not_enough_data(n_splits, train_size_ratio)
            train_dataset = deepcopy(dataset)
            train_dataset[INTENTS] = dict()
            for intent_name, utterance in train_utterances:
                if intent_name not in train_dataset[INTENTS]:
                    train_dataset[INTENTS][intent_name] = {UTTERANCES: []}
                train_dataset[INTENTS][intent_name][UTTERANCES].append(
                    deepcopy(utterance))
            splits.append((train_dataset, test_utterances))
    except ValueError:
        # StratifiedKFold raises a ValueError when some intent has fewer
        # utterances than n_splits
        not_enough_data(n_splits, train_size_ratio)
    return splits
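A minimal usage sketch for the function above, assuming the dataset-key constants resolve to the lowercase JSON keys of the Snips dataset format; the toy dataset below is made up for illustration, and drop_entities=True is passed so the sketch does not depend on the update_entities_with_utterances helper, which is not shown on this page.

toy_dataset = {
    "intents": {
        "turn_on": {"utterances": [
            {"data": [{"text": "turn on the light"}]},
            {"data": [{"text": "switch the lamp on"}]},
            {"data": [{"text": "lights on please"}]},
        ]},
        "turn_off": {"utterances": [
            {"data": [{"text": "turn off the light"}]},
            {"data": [{"text": "switch the lamp off"}]},
            {"data": [{"text": "lights off please"}]},
        ]},
    },
    "entities": {},
    "language": "en",
}

splits = create_shuffle_stratified_splits(toy_dataset, n_splits=3,
                                          drop_entities=True, seed=42)
for train_dataset, test_utterances in splits:
    # each split pairs a training dataset with the held-out
    # (intent_name, utterance) pairs of that fold
    print(len(train_dataset["intents"]), len(test_utterances))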
Example 2
    def test_get_utterances_subset_should_work(self):
        # Given
        utterances = [
            ("intent1", {
                "data": [{
                    "text": "text1"
                }]
            }),
            ("intent1", {
                "data": [{
                    "text": "text2"
                }]
            }),
            ("intent1", {
                "data": [{
                    "text": "text3"
                }]
            }),
            ("intent1", {
                "data": [{
                    "text": "text4"
                }]
            }),
            ("intent2", {
                "data": [{
                    "text": "text1"
                }]
            }),
            ("intent2", {
                "data": [{
                    "text": "text2"
                }]
            }),
            ("intent3", {
                "data": [{
                    "text": "text1"
                }]
            }),
            ("intent3", {
                "data": [{
                    "text": "text2"
                }]
            }),
            ("intent3", {
                "data": [{
                    "text": "text3"
                }]
            }),
            ("intent3", {
                "data": [{
                    "text": "text4"
                }]
            }),
            ("intent3", {
                "data": [{
                    "text": "text5"
                }]
            }),
            ("intent3", {
                "data": [{
                    "text": "text6"
                }]
            }),
        ]

        # When
        utterances_subset = get_utterances_subset(utterances, ratio=0.5)
        utterances_subset = sorted(
            utterances_subset,
            key=lambda u: "%s%s" % (u[0], u[1]["data"][0]["text"]))

        # Then
        expected_utterances = [
            ("intent1", {
                "data": [{
                    "text": "text1"
                }]
            }),
            ("intent1", {
                "data": [{
                    "text": "text2"
                }]
            }),
            ("intent2", {
                "data": [{
                    "text": "text1"
                }]
            }),
            ("intent3", {
                "data": [{
                    "text": "text1"
                }]
            }),
            ("intent3", {
                "data": [{
                    "text": "text2"
                }]
            }),
            ("intent3", {
                "data": [{
                    "text": "text3"
                }]
            }),
        ]
        self.assertListEqual(expected_utterances, utterances_subset)
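get_utterances_subset itself is not shown on this page; a minimal sketch consistent with the test above, keeping the first int(ratio * n) utterances of each intent in their original order, could look like the following. It illustrates the expected behavior and is not necessarily the library's actual implementation.

from copy import deepcopy


def get_utterances_subset(utterances, ratio):
    # group the (intent_name, utterance) pairs by intent, preserving order
    by_intent = {}
    for intent_name, utterance in utterances:
        by_intent.setdefault(intent_name, []).append(deepcopy(utterance))

    # keep the first int(ratio * n) utterances of each intent
    subset = []
    for intent_name, intent_utterances in by_intent.items():
        kept = int(ratio * len(intent_utterances))
        subset += [(intent_name, u) for u in intent_utterances[:kept]]
    return subset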
Example 3
def create_shuffle_stratified_splits(
    dataset,
    n_splits,
    train_size_ratio=1.0,
    drop_entities=False,
    seed=None,
    out_of_domain_utterances=None,
    intents_filter=None,
):
    if train_size_ratio > 1.0 or train_size_ratio < 0:
        error_msg = "Invalid value for train size ratio: %s" % train_size_ratio
        logger.error(error_msg)
        raise ValueError(error_msg)

    nb_utterances = {
        intent: len(data[UTTERANCES])
        for intent, data in dataset[INTENTS].items()
    }
    # every intent must keep at least n_splits utterances once the train
    # size ratio is applied, otherwise stratified splitting cannot work
    if any(nb * train_size_ratio < n_splits
           for nb in nb_utterances.values()):
        raise NotEnoughDataError(dataset, n_splits, train_size_ratio)

    if drop_entities:
        dataset = deepcopy(dataset)
        for entity, data in dataset[ENTITIES].items():
            data[DATA] = []
    else:
        dataset = update_entities_with_utterances(dataset)

    utterances = np.array([
        (intent_name, utterance)
        for intent_name, intent_data in dataset[INTENTS].items()
        for utterance in intent_data[UTTERANCES]
    ])
    intents = np.array([u[0] for u in utterances])
    # dummy feature matrix: StratifiedKFold only uses the labels
    X = np.zeros(len(intents))
    random_state = check_random_state(seed)
    sss = StratifiedKFold(n_splits=n_splits,
                          shuffle=True,
                          random_state=random_state)
    splits = []
    for train_index, test_index in sss.split(X, intents):
        train_utterances = utterances[train_index].tolist()
        train_utterances = get_utterances_subset(train_utterances,
                                                 train_size_ratio)
        test_utterances = utterances[test_index].tolist()
        train_dataset = deepcopy(dataset)
        train_dataset[INTENTS] = dict()
        for intent_name, utterance in train_utterances:
            if intent_name not in train_dataset[INTENTS]:
                train_dataset[INTENTS][intent_name] = {UTTERANCES: []}
            train_dataset[INTENTS][intent_name][UTTERANCES].append(
                deepcopy(utterance))
        splits.append((train_dataset, test_utterances))

    if intents_filter is not None:
        # keep only the test utterances whose intent is in the filter; the
        # training datasets are left untouched
        filtered_splits = []
        for train_dataset, test_utterances in splits:
            test_utterances = [(intent_name, utterance)
                               for intent_name, utterance in test_utterances
                               if intent_name in intents_filter]
            filtered_splits.append((train_dataset, test_utterances))
        splits = filtered_splits

    if out_of_domain_utterances is not None:
        # out-of-domain texts are appended to every test split, labeled
        # with the None intent
        additional_test_utterances = [[
            NONE_INTENT_NAME, {
                DATA: [{
                    TEXT: utterance
                }]
            }
        ] for utterance in out_of_domain_utterances]
        for split in splits:
            split[1].extend(additional_test_utterances)

    return splits
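A hedged usage sketch for the extended signature, reusing the hypothetical toy_dataset from the first sketch: intents_filter prunes each test split down to the listed intents, while every out-of-domain utterance is appended to every test split under the None intent.

splits = create_shuffle_stratified_splits(
    toy_dataset,
    n_splits=3,
    drop_entities=True,
    seed=42,
    out_of_domain_utterances=["what's the weather like"],
    intents_filter=["turn_on"],
)
for train_dataset, test_utterances in splits:
    # test labels are now "turn_on" plus the None intent of the
    # out-of-domain utterance
    print(sorted({intent for intent, _ in test_utterances}))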