Ejemplo n.º 1
0
def get_representation_distance_ratio(encoder: AbstractEncoder, data_filename: str, print_stats: bool = False):
    """Return (avg distance between all points) / (avg distance within equivalence classes).

    Every entry of the dataset loaded from ``data_filename`` is treated as one
    equivalence class made of its ``'original'`` sample followed by all of its
    ``'noise'`` samples. Each sample is encoded with ``encoder.get_encoding``
    and distances are cosine distances between encodings.

    Args:
        encoder: object exposing ``get_encoding(sample)``.
        data_filename: path handed to ``import_data``; the result must be a
            mapping whose values have ``'original'`` and ``'noise'`` entries.
        print_stats: when true, print both averages before returning.

    Returns:
        The average pairwise distance over all points divided by the average
        pairwise distance inside equivalence classes.

    Raises:
        AssertionError: if an encoding or a pairwise distance is NaN.
        ZeroDivisionError: if no equivalence class has at least two samples,
            so there is no within-class pair to average over.
    """
    data = import_data(data_filename)
    encodings = []
    # (start, stop) row range of every equivalence class. Samples of a class
    # are appended back to back, so each class is a contiguous block of rows.
    class_bounds = []

    for code in data.values():
        start = len(encodings)
        samples = [code['original']]
        samples.extend(code['noise'])
        for sample in samples:
            enc = encoder.get_encoding(sample)
            assert not np.isnan(np.sum(enc)), "encoder produced a NaN encoding"
            encodings.append(enc)
        class_bounds.append((start, len(encodings)))

    encodings = np.array(encodings)

    # The condensed pdist output already holds every unordered pair exactly
    # once, so no dense square matrix is needed to average over all pairs.
    all_pair_distances = pdist(encodings, 'cosine')
    assert not np.any(np.isnan(all_pair_distances)), "NaN cosine distance between encodings"
    avg_distance_between_all_points = np.sum(all_pair_distances) / (len(encodings) * (len(encodings) - 1) / 2)

    sum_distance_within_eq_class = 0.
    num_pairs = 0
    for start, stop in class_bounds:
        num_elements_in_class = stop - start
        if num_elements_in_class < 2:
            continue  # a singleton class contributes no pair
        # A pair's cosine distance depends only on its two vectors, so it can
        # be recomputed on the class's own rows.
        sum_distance_within_eq_class += np.sum(pdist(encodings[start:stop], 'cosine'))
        num_pairs += num_elements_in_class * (num_elements_in_class - 1) // 2

    if num_pairs == 0:
        # Same exception type the bare division would raise, with a usable message.
        raise ZeroDivisionError(
            "no equivalence class in %r has at least two samples" % (data_filename,))

    avg_distance_within_eq_class = sum_distance_within_eq_class / num_pairs
    if print_stats:
        print(
            "Within Avg Dist: %s  All Avg Dist: %s " % (avg_distance_within_eq_class, avg_distance_between_all_points))
    return avg_distance_between_all_points / avg_distance_within_eq_class
Ejemplo n.º 2
0
def evaluate_on_all_dims(encoder_filename: str, full_dataset_filename,
                         test_datsets_fileprefix, num_nns: int = 15) -> dict:
    """Return a dict with all results from comparison.

    Two test sets are evaluated: ``<prefix>-testset.json.gz`` and
    ``<prefix>-neweqtestset.json.gz``. For each one the result dict holds the
    distance ratio from ``get_representation_distance_ratio`` and, for every
    ``k`` in ``1..num_nns``, the k-th entry of the statistics returned by
    ``SemanticEquivalentDistanceEvaluation.evaluate_with_test``, computed
    against the full dataset (``...all``) and against the test set itself
    (``...within``).

    Args:
        encoder_filename: path passed to ``AbstractEncoder.load``.
        full_dataset_filename: dataset used for the ``...all`` statistics.
        test_datsets_fileprefix: common prefix of the two test-set files.
        num_nns: number of nearest neighbours requested from the evaluator
            and number of per-k entries stored; defaults to 15.

    Returns:
        Dict with keys ``testintradist``, ``neweqintradist`` and, for each k,
        ``testsetknn<k>all``, ``testsetknn<k>within``, ``neweqknn<k>all``,
        ``neweqknn<k>within``.

    Raises:
        AssertionError: if either test-set file does not exist.
    """
    encoder = AbstractEncoder.load(encoder_filename)

    testset_filename = test_datsets_fileprefix + '-testset.json.gz'
    assert os.path.exists(testset_filename), "missing test set: " + testset_filename

    neweq_testset_filename = test_datsets_fileprefix + '-neweqtestset.json.gz'
    assert os.path.exists(neweq_testset_filename), "missing test set: " + neweq_testset_filename

    results = {}
    results['testintradist'] = get_representation_distance_ratio(
        encoder, testset_filename)
    results['neweqintradist'] = get_representation_distance_ratio(
        encoder, neweq_testset_filename)

    # The evaluator is handed the already-loaded encoder, so no filename is needed.
    nn_evaluator = SemanticEquivalentDistanceEvaluation(None, encoder)

    test_nn_all_stats = nn_evaluator.evaluate_with_test(
        full_dataset_filename, testset_filename, num_nns=num_nns)
    test_nn_within_stats = nn_evaluator.evaluate_with_test(
        testset_filename, testset_filename, num_nns=num_nns)

    neweq_nn_all_stats = nn_evaluator.evaluate_with_test(
        full_dataset_filename, neweq_testset_filename, num_nns=num_nns)
    neweq_nn_within_stats = nn_evaluator.evaluate_with_test(
        neweq_testset_filename, neweq_testset_filename, num_nns=num_nns)

    # Keys are 1-based (knn1 .. knn<num_nns>); the stats are indexed from 0.
    for i in range(num_nns):
        k = str(i + 1)
        results['testsetknn' + k + 'all'] = test_nn_all_stats[i]
        results['testsetknn' + k + 'within'] = test_nn_within_stats[i]
        results['neweqknn' + k + 'all'] = neweq_nn_all_stats[i]
        results['neweqknn' + k + 'within'] = neweq_nn_within_stats[i]

    return results
Ejemplo n.º 3
0
        dataset_samples.append(
            (''.join(code['original'][0]), code['original'][1]))
        for noisy_sample in code['noise']:
            dataset_samples.append((''.join(noisy_sample[0]), noisy_sample[1]))
    return set(dataset_samples)


# Script entry point: expects an encoder pickle, a dataset and a test set on
# the command line, then encodes the dataset's samples.
if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage <encoderPkl> <dataset.json.gz> <testset.json.gz>")
        sys.exit(-1)

    # Samples of the test set (third argument), as returned by get_dataset_samples.
    testset_samples = get_dataset_samples(sys.argv[3])

    data = import_data(sys.argv[2])
    encoder = AbstractEncoder.load(sys.argv[1])

    # expression_data[i] describes the sample whose encoding is encodings[i];
    # add_sample appends to both lists together.
    expression_data, encodings = [], []
    eq_class_idx_to_names = {}
    # NOTE(review): not updated in the visible part of the script; presumably
    # filled further down -- confirm.
    eq_class_counts = defaultdict(int)

    def add_sample(data, eq_class_idx: int):
        """Record one sample and its encoding under equivalence class ``eq_class_idx``.

        ``data`` here is a single sample (it shadows the module-level dataset
        of the same name); its element at index 1 is stored as the ``tree``.
        """
        sample_data = dict(tree=data[1], eq_class=eq_class_idx)
        expression_data.append(sample_data)

        representation = encoder.get_encoding(data)
        # Fail fast if the encoder produced any NaN component.
        assert not np.isnan(np.sum(representation))
        encodings.append(representation)

    # Each dataset entry gets an index from its enumeration order; remember its name.
    for eq_class_idx, (name, code) in enumerate(data.items()):
        eq_class_idx_to_names[eq_class_idx] = name
Ejemplo n.º 4
0
 def __init__(self, encoder_filename: str, encoder: AbstractEncoder = None):
     """Store the encoder to use.

     An explicitly supplied ``encoder`` takes precedence; ``encoder_filename``
     is only read (via ``AbstractEncoder.load``) when no encoder is given.
     """
     self.__encoder = encoder if encoder is not None else AbstractEncoder.load(encoder_filename)
Ejemplo n.º 5
0
 def save(self, filename: str):
     """Save this encoder to ``filename`` without its compiled methods.

     ``__compiled_methods`` is set to ``None`` for the duration of
     ``AbstractEncoder.save`` and put back afterwards, even when saving
     raises, so a failed save cannot leave the instance without them.
     """
     # NOTE(review): presumably the compiled methods cannot be serialized -- confirm.
     compiled_methods, self.__compiled_methods = self.__compiled_methods, None
     try:
         AbstractEncoder.save(self, filename)
     finally:
         self.__compiled_methods = compiled_methods
Ejemplo n.º 6
0
def get_encoder():
    """Load and return the encoder stored at ``ENCODER_PKL``."""
    encoder = AbstractEncoder.load(ENCODER_PKL)
    return encoder