def test_should_create_same_pairs(number_translation_features_dict):
    """Every pair returned by create_same_pairs holds two features with the same number."""
    same_pairs, _ = create_same_pairs(number_translation_features_dict,
                                      MIN_PAIRS_NUM, DATASET_SPEC)
    assert len(same_pairs) == MIN_PAIRS_NUM

    assert all(lhs.number == rhs.number for lhs, rhs in same_pairs)
def test_should_create_correct_number_of_same_pair_classes(
        number_translation_features_dict, min_pairs_num):
    """Each (left, right) number combination appears at least its fair share of times."""
    same_pairs, _ = create_same_pairs(number_translation_features_dict,
                                      min_pairs_num, DATASET_SPEC)
    class_count = len(number_translation_features_dict.keys())
    min_expected = min_pairs_num // class_count
    frequencies = collections.Counter(
        [(pair[0].number, pair[1].number) for pair in same_pairs])

    for occurrences in frequencies.values():
        assert occurrences >= min_expected
def test_should_create_same_pairs_without_repeating_pairs(
        number_translation_features_dict):
    """With repeating_pairs=False no (left, right) translation pair occurs twice."""
    same_pairs, _ = create_same_pairs(number_translation_features_dict,
                                      MIN_PAIRS_NUM,
                                      gen.dataset_spec(repeating_pairs=False))

    already_seen = set()
    for lhs, rhs in same_pairs:
        trans_pair = (lhs.trans, rhs.trans)
        assert trans_pair not in already_seen
        already_seen.add(trans_pair)
def test_should_create_correct_pair_number(number_translation_features_dict,
                                           min_pairs_num, actual_pair_num):
    """Both the pairs and their labels match the expected pair count."""
    created_pairs, created_labels = create_same_pairs(
        number_translation_features_dict, min_pairs_num, DATASET_SPEC)

    assert len(created_pairs) == actual_pair_num
    assert len(created_labels) == actual_pair_num
# Example #5  (scraper artifact "Exemple #5" / "0" turned into a comment so the file parses)
def _create_paired_data(
    examples: np.ndarray,
    labels: np.ndarray,
    dataset_spec: DatasetSpec,
    size: Optional[int] = None
) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
    """Build shuffled same/different example pairs for pairwise training.

    Groups `examples` by their label, optionally drops excluded label keys,
    then creates an equal number of same-label and different-label pairs,
    stacks and shuffles them in unison, and returns feature/label dicts
    keyed by the project's `consts` names.

    Args:
        examples: Array of input examples (one per label entry).
        labels: Array of labels, aligned with `examples`.
        dataset_spec: Controls pair creation and whether excluded keys are kept.
        size: Optional total number of pairs-worth of data; when given,
            `size // 2` same pairs and `size // 2` different pairs are made,
            otherwise `len(examples) // 2` of each.

    Returns:
        A `(features_dict, labels_dict)` tuple: features hold the left/right
        image arrays; labels hold the pair (same/diff) label and the left and
        right per-example labels.
    """
    # Excluded keys are skipped only when the spec does not ask to keep them.
    if dataset_spec.with_excludes:
        keys_to_drop = []
    else:
        keys_to_drop = config[consts.EXCLUDED_KEYS]

    # Group examples by label: label -> list of examples.
    features_dict = collections.defaultdict(list)
    for x, y in zip(examples, labels):
        features_dict[y].append(x)

    utils.log("Creating paired data excluding keys: " + str(keys_to_drop))
    # Pop each excluded key individually so one missing key does not abort
    # the remaining exclusions (the old single try around a pop-comprehension
    # stopped at the first KeyError, leaving later keys in the dict).
    for key in keys_to_drop:
        try:
            features_dict.pop(key)
        except KeyError as e:
            utils.log("Key to exclude not found in dataset: {}".format(e))

    # Half the requested size for same pairs, half for different pairs.
    if size:
        pairs_num = size // 2
    else:
        pairs_num = len(examples) // 2
    same_pairs: List[Tuple[ndarray, ndarray]]
    same_pairs, same_labels = generating_pairs.create_same_pairs(
        features_dict, pairs_num, dataset_spec)
    diff_pairs: List[Tuple[ndarray, ndarray]]
    left_labels: List[int]
    right_labels: List[int]
    diff_pairs, left_labels, right_labels = generating_pairs.create_different_pairs(
        features_dict, pairs_num)

    # One-hot-ish pair labels: [1] for same-label pairs, [0] for different.
    diff_one_hot_labels = [[0]] * len(diff_pairs)
    same_one_hot_labels = [[1]] * len(same_pairs)

    # Stack same pairs first, then different pairs, keeping labels aligned.
    all_pairs: ndarray = np.vstack((same_pairs, diff_pairs))
    left_digit_labels: ndarray = np.hstack((same_labels, left_labels))
    right_digit_labels: ndarray = np.hstack((same_labels, right_labels))
    pair_labels: ndarray = np.vstack(
        (same_one_hot_labels, diff_one_hot_labels))

    # Shuffle all four arrays with the same permutation so rows stay aligned.
    all_pairs, pair_labels, all_left_labels, all_right_labels = unison_shuffle(
        all_pairs, pair_labels, left_digit_labels, right_digit_labels)

    left_pairs, right_pairs = zip(*all_pairs)

    features_dict = collections.OrderedDict({
        consts.LEFT_FEATURE_IMAGE:
        np.array(left_pairs),
        consts.RIGHT_FEATURE_IMAGE:
        np.array(right_pairs)
    })
    labels_dict = collections.OrderedDict({
        consts.PAIR_LABEL:
        # sum over the single-element one-hot row recovers the 0/1 pair label.
        np.array(pair_labels.sum(axis=1)),
        consts.LEFT_FEATURE_LABEL:
        np.array(all_left_labels),
        consts.RIGHT_FEATURE_LABEL:
        np.array(all_right_labels),
    })

    return features_dict, labels_dict