def get_representation_distance_ratio(encoder: AbstractEncoder, data_filename: str, print_stats: bool = False):
    """Compute the ratio (avg distance between all points) / (avg distance within an equivalence class).

    Every entry of the dataset loaded from ``data_filename`` is treated as one
    equivalence class made of its ``'original'`` sample followed by all of its
    ``'noise'`` samples. Each sample is encoded with ``encoder.get_encoding``
    and pairwise cosine distances between the encodings are averaged.

    :param encoder: object whose ``get_encoding(sample)`` returns the encoding of a sample.
    :param data_filename: file handed to ``import_data``.
    :param print_stats: when true, print both averages before returning.
    :return: average distance over all pairs divided by the average distance
        over pairs that belong to the same equivalence class.
    :raises ValueError: if there are fewer than two encoded samples, or no
        equivalence class has at least two samples (the ratio is undefined).
    """
    data = import_data(data_filename)

    encodings = []
    # Half-open [start, end) row ranges into `encodings`, one per equivalence
    # class. Members of a class are appended consecutively, so a range is enough.
    class_bounds = []
    for code in data.values():
        start = len(encodings)
        samples = [code['original']]
        samples.extend(code['noise'])
        for sample in samples:
            enc = encoder.get_encoding(sample)
            assert not np.isnan(np.sum(enc))  # sanity check on the encoder output
            encodings.append(enc)
        class_bounds.append((start, len(encodings)))

    if len(encodings) < 2:
        raise ValueError(
            "Need at least two encoded samples to compute pairwise distances, got %d" % len(encodings))
    encodings = np.array(encodings)

    # pdist returns the condensed form: exactly one entry per unordered pair.
    # Summing it equals summing the strict lower triangle of the square
    # matrix, without allocating the square matrix at all.
    condensed_distances = pdist(encodings, 'cosine')
    assert not np.any(np.isnan(condensed_distances))
    avg_distance_between_all_points = np.sum(condensed_distances) / len(condensed_distances)

    sum_distance_within_eq_class = 0.
    num_pairs = 0
    for start, end in class_bounds:
        num_elements_in_class = end - start
        if num_elements_in_class < 2:
            continue  # a singleton class contributes no pairs
        # The class occupies contiguous rows, so its pairwise distances can be
        # computed directly from that slice.
        sum_distance_within_eq_class += np.sum(pdist(encodings[start:end], 'cosine'))
        num_pairs += num_elements_in_class * (num_elements_in_class - 1) // 2

    if num_pairs == 0:
        raise ValueError(
            "No equivalence class has at least two samples; the within-class average distance is undefined")
    avg_distance_within_eq_class = sum_distance_within_eq_class / num_pairs

    if print_stats:
        print(
            "Within Avg Dist: %s All Avg Dist: %s " % (avg_distance_within_eq_class, avg_distance_between_all_points))
    return avg_distance_between_all_points / avg_distance_within_eq_class
def evaluate_on_all_dims(encoder_filename: str, full_dataset_filename, test_datsets_fileprefix) -> dict:
    """Run every comparison metric for a stored encoder and collect the results.

    The two evaluation files are derived from ``test_datsets_fileprefix`` by
    appending ``-testset.json.gz`` and ``-neweqtestset.json.gz``; both must
    exist on disk.

    :param encoder_filename: file passed to ``AbstractEncoder.load``.
    :param full_dataset_filename: dataset used as the first argument of the "all" nearest-neighbour runs.
    :param test_datsets_fileprefix: common prefix of the two evaluation files.
    :return: dict holding the two distance ratios (``testintradist``,
        ``neweqintradist``) and, for k = 1..15, the entries
        ``testsetknn<k>all``, ``testsetknn<k>within``, ``neweqknn<k>all``
        and ``neweqknn<k>within``.
    """
    num_neighbours = 15

    encoder = AbstractEncoder.load(encoder_filename)

    testset_filename = test_datsets_fileprefix + '-testset.json.gz'
    assert os.path.exists(testset_filename)
    neweq_testset_filename = test_datsets_fileprefix + '-neweqtestset.json.gz'
    assert os.path.exists(neweq_testset_filename)

    results = {
        'testintradist': get_representation_distance_ratio(encoder, testset_filename),
        'neweqintradist': get_representation_distance_ratio(encoder, neweq_testset_filename),
    }

    nn_evaluator = SemanticEquivalentDistanceEvaluation(None, encoder)

    # (key prefix, key suffix, first file, second file) for each run, listed
    # in the order the evaluations are executed and the keys are written.
    run_specs = (
        ('testsetknn', 'all', full_dataset_filename, testset_filename),
        ('testsetknn', 'within', testset_filename, testset_filename),
        ('neweqknn', 'all', full_dataset_filename, neweq_testset_filename),
        ('neweqknn', 'within', neweq_testset_filename, neweq_testset_filename),
    )
    knn_series = []
    for key_prefix, key_suffix, first_file, second_file in run_specs:
        stats = nn_evaluator.evaluate_with_test(first_file, second_file, num_nns=num_neighbours)
        knn_series.append((key_prefix, key_suffix, stats))

    for i in range(num_neighbours):
        rank = str(i + 1)  # result keys are 1-based
        for key_prefix, key_suffix, stats in knn_series:
            results[key_prefix + rank + key_suffix] = stats[i]
    return results
dataset_samples.append( (''.join(code['original'][0]), code['original'][1])) for noisy_sample in code['noise']: dataset_samples.append((''.join(noisy_sample[0]), noisy_sample[1])) return set(dataset_samples) if __name__ == '__main__': if len(sys.argv) != 4: print("Usage <encoderPkl> <dataset.json.gz> <testset.json.gz>") sys.exit(-1) testset_samples = get_dataset_samples(sys.argv[3]) data = import_data(sys.argv[2]) encoder = AbstractEncoder.load(sys.argv[1]) expression_data, encodings = [], [] eq_class_idx_to_names = {} eq_class_counts = defaultdict(int) def add_sample(data, eq_class_idx: int): sample_data = dict(tree=data[1], eq_class=eq_class_idx) expression_data.append(sample_data) representation = encoder.get_encoding(data) assert not np.isnan(np.sum(representation)) encodings.append(representation) for eq_class_idx, (name, code) in enumerate(data.items()): eq_class_idx_to_names[eq_class_idx] = name
def __init__(self, encoder_filename: str, encoder: AbstractEncoder = None):
    """Store the encoder to evaluate.

    :param encoder_filename: file passed to ``AbstractEncoder.load``; only read when ``encoder`` is ``None``.
    :param encoder: an already constructed encoder; when given it is used as-is and nothing is loaded.
    """
    # An explicitly supplied encoder wins; otherwise fall back to loading from disk.
    self.__encoder = AbstractEncoder.load(encoder_filename) if encoder is None else encoder
def save(self, filename: str):
    """Save this encoder to ``filename`` via ``AbstractEncoder.save``.

    ``__compiled_methods`` is set to ``None`` for the duration of the save and
    put back afterwards, so the saved object does not carry it
    (presumably because the compiled methods cannot be serialized -- confirm).
    The attribute is restored even when saving raises, so a failed save does
    not leave the encoder without its compiled methods.

    :param filename: destination passed through to ``AbstractEncoder.save``.
    """
    compiled_methods = self.__compiled_methods
    self.__compiled_methods = None
    try:
        AbstractEncoder.save(self, filename)
    finally:
        # Always restore, including on failure, to keep the instance usable.
        self.__compiled_methods = compiled_methods
def get_encoder():
    """Load and return the encoder stored at ``ENCODER_PKL``."""
    encoder = AbstractEncoder.load(ENCODER_PKL)
    return encoder