def test_should_create_same_pairs(number_translation_features_dict):
    """Same-class pairs: expect MIN_PAIRS_NUM pairs whose members share a number."""
    pairs, _ = create_same_pairs(
        number_translation_features_dict, MIN_PAIRS_NUM, DATASET_SPEC)

    assert len(pairs) == MIN_PAIRS_NUM
    for lhs, rhs in pairs:
        assert lhs.number == rhs.number
def test_should_create_correct_number_of_same_pair_classes(
        number_translation_features_dict, min_pairs_num):
    """Each class should appear at least its fair share of times among same-pairs."""
    pairs, _ = create_same_pairs(
        number_translation_features_dict, min_pairs_num, DATASET_SPEC)

    # Fair share = requested pair count split evenly across available classes.
    expected_frequency = min_pairs_num // len(number_translation_features_dict)
    counter = collections.Counter(
        (pair[0].number, pair[1].number) for pair in pairs)

    for freq in counter.values():
        assert freq >= expected_frequency
def test_should_create_same_pairs_without_repeating_pairs(
        number_translation_features_dict):
    """With repeating_pairs=False, no (left.trans, right.trans) combination repeats."""
    pairs, _ = create_same_pairs(
        number_translation_features_dict, MIN_PAIRS_NUM,
        gen.dataset_spec(repeating_pairs=False))

    seen = set()
    for lhs, rhs in pairs:
        key = (lhs.trans, rhs.trans)
        assert key not in seen
        seen.add(key)
def test_should_create_correct_pair_number(number_translation_features_dict,
                                           min_pairs_num, actual_pair_num):
    """Pair list and label list must agree in length and match the expected count."""
    pairs, same_labels = create_same_pairs(
        number_translation_features_dict, min_pairs_num, DATASET_SPEC)

    assert len(pairs) == actual_pair_num
    assert len(same_labels) == actual_pair_num
def _create_paired_data(
        examples: np.ndarray,
        labels: np.ndarray,
        dataset_spec: DatasetSpec,
        size: Optional[int] = None
) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
    """Build a shuffled dataset of same-class and different-class example pairs.

    Groups `examples` by their label, optionally drops excluded classes,
    then generates an equal number of same-label and different-label pairs.

    Args:
        examples: array of feature examples (e.g. images).
        labels: per-example class labels, aligned with `examples`.
        dataset_spec: controls pair generation; `with_excludes` decides
            whether configured excluded keys are kept.
        size: desired total number of pairs; defaults to len(examples),
            half same-label and half different-label.

    Returns:
        (features_dict, labels_dict) where features_dict holds left/right
        image arrays and labels_dict holds the pair label (1 = same class)
        plus the per-side class labels.
    """
    if dataset_spec.with_excludes:
        keys_to_drop = []
    else:
        keys_to_drop = config[consts.EXCLUDED_KEYS]

    # Group examples by label.
    features_dict = collections.defaultdict(list)
    for x, y in zip(examples, labels):
        features_dict[y].append(x)

    utils.log("Creating paired data excluding keys: " + str(keys_to_drop))
    # Drop each excluded key independently. The previous implementation
    # wrapped a single list comprehension in one try/except, so the first
    # missing key aborted removal of all remaining keys; now every key is
    # attempted and each miss is logged.
    for key in keys_to_drop:
        try:
            features_dict.pop(key)
        except KeyError as e:
            utils.log("Key to exclude not found in dataset: {}".format(e))

    # Half of the requested size for each pair kind (same / different).
    if size:
        pairs_num = size // 2
    else:
        pairs_num = len(examples) // 2

    same_pairs: List[Tuple[ndarray, ndarray]]
    same_pairs, same_labels = generating_pairs.create_same_pairs(
        features_dict, pairs_num, dataset_spec)
    diff_pairs: List[Tuple[ndarray, ndarray]]
    left_labels: List[int]
    right_labels: List[int]
    diff_pairs, left_labels, right_labels = generating_pairs.create_different_pairs(
        features_dict, pairs_num)

    # NOTE: [[v]] * n repeats the same inner list object; safe here because
    # the inner lists are never mutated, only stacked into an array.
    diff_one_hot_labels = [[0]] * len(diff_pairs)
    same_one_hot_labels = [[1]] * len(same_pairs)

    all_pairs: ndarray = np.vstack((same_pairs, diff_pairs))
    # Same-label pairs share one label on both sides, hence same_labels twice.
    left_digit_labels: ndarray = np.hstack((same_labels, left_labels))
    right_digit_labels: ndarray = np.hstack((same_labels, right_labels))
    pair_labels: ndarray = np.vstack(
        (same_one_hot_labels, diff_one_hot_labels))

    # Shuffle all four arrays with the same permutation to keep alignment.
    all_pairs, pair_labels, all_left_labels, all_right_labels = unison_shuffle(
        all_pairs, pair_labels, left_digit_labels, right_digit_labels)

    left_pairs, right_pairs = zip(*all_pairs)
    features_dict = collections.OrderedDict({
        consts.LEFT_FEATURE_IMAGE: np.array(left_pairs),
        consts.RIGHT_FEATURE_IMAGE: np.array(right_pairs)
    })
    labels_dict = collections.OrderedDict({
        # sum(axis=1) collapses the one-hot column back to a 0/1 scalar.
        consts.PAIR_LABEL: np.array(pair_labels.sum(axis=1)),
        consts.LEFT_FEATURE_LABEL: np.array(all_left_labels),
        consts.RIGHT_FEATURE_LABEL: np.array(all_right_labels),
    })
    return features_dict, labels_dict