Ejemplo n.º 1
0
def hierarchical_clustering_testing(encoder_model, data_path, batch_size, device, distance):
    """Print the Dasgupta cost of agglomerative clusterings built on embedding distances.

    The strings and similarities returned by ``load_hc_data`` are embedded with
    ``encoder_model``; a pairwise distance matrix between the embeddings is
    computed with the ``DISTANCE_MATRIX`` entry selected by ``distance``; one
    linkage tree per method is then scored with ``dasgupta_cost``.

    Args:
        encoder_model: model handed to ``embed_strings``; its ``scaling``
            attribute is forwarded to the distance function.
        data_path: location passed to ``load_hc_data``.
        batch_size: batch size of the unshuffled ``DataLoader``.
        device: device argument passed to ``embed_strings``.
        distance: key selecting a function from ``DISTANCE_MATRIX``.

    Returns:
        None. The ``{method: {"DC": cost}}`` dictionary is only printed.
    """
    # Load the data and turn the index-encoded strings into a loader.
    raw_strings, similarities = load_hc_data(data_path)
    index_strings = torch.from_numpy(raw_strings).long()
    print("Hierarchical", index_strings.shape)
    loader = torch.utils.data.DataLoader(
        index_to_one_hot(index_strings), batch_size=batch_size, shuffle=False
    )

    # Embed every string, then measure all pairwise embedding distances.
    embeddings = embed_strings(loader, encoder_model, device)
    distance_fn = DISTANCE_MATRIX[distance]
    pairwise = distance_fn(embeddings, embeddings, encoder_model.scaling)

    # Floating point arithmetic can leave the matrix slightly asymmetric or
    # with a non-zero diagonal; enforce both properties before it is
    # condensed with squareform.
    pairwise = (pairwise + pairwise.T) / 2
    rows, cols = np.diag_indices(pairwise.shape[0])
    pairwise[rows, cols] = 0.0

    def _dasgupta_for(method):
        # Build the linkage tree for one method and score it.
        tree = to_nx_tree(linkage(squareform(pairwise), method))
        return dasgupta_cost(tree, similarities)

    metrics = {
        method: {"DC": _dasgupta_for(method)}
        for method in ("single", "complete", "average", "ward")
    }
    print(metrics)
Ejemplo n.º 2
0
    def __init__(self, sequences, distances):
        """Keep the encoded sequences and their normalised distance labels.

        Args:
            sequences: index-encoded sequences; passed to ``index_to_one_hot``.
            distances: distance labels; stored after division by
                ``normalisation_constant``.
        """
        # Size of the last axis of the raw input, read before encoding.
        self.len_sequence = sequences.shape[-1]

        encoded = index_to_one_hot(sequences)
        self.sequences = encoded
        self.N_sequences = sequences.shape[0]

        # Normalise labels by the second-to-last axis of the encoded data
        # (presumably the sequence length -- confirm against index_to_one_hot).
        scale = encoded.shape[-2]
        self.normalisation_constant = scale
        self.distances = distances / scale
Ejemplo n.º 3
0
    def __init__(self, sequences, distances, multiplicity=1):
        """Keep pre-batched encoded sequences and their normalised distances.

        Args:
            sequences: index-encoded sequences; passed to ``index_to_one_hot``.
            distances: distance labels; stored after division by
                ``normalisation_constant``.
            multiplicity: per the original author's note, indicates (1/2) the
                number of times a string is sampled at every epoch.
        """
        # Size of the last axis of the raw input, read before encoding.
        self.len_sequence = sequences.shape[-1]
        self.multiplicity = multiplicity

        encoded = index_to_one_hot(sequences)
        self.sequences = encoded
        # The first two axes of the encoded data are read as
        # (number of batches, batch size).
        self.N_batches, self.batch_size = encoded.shape[0], encoded.shape[1]

        # Normalise labels by the second-to-last axis of the encoded data
        # (presumably the sequence length -- confirm against index_to_one_hot).
        scale = encoded.shape[-2]
        self.normalisation_constant = scale
        self.distances = distances / scale
Ejemplo n.º 4
0
    def __init__(self, sequences, distances):
        """Keep pre-batched encoded sequences and per-level normalised distances.

        Args:
            sequences: index-encoded sequences; passed to ``index_to_one_hot``.
            distances: iterable of distance labels; the entry at position
                ``p`` is divided by ``normalisation_constant * 2**p``.
        """
        # Size of the last axis of the raw input, read before encoding.
        self.len_sequence = sequences.shape[-1]

        encoded = index_to_one_hot(sequences)
        self.sequences = encoded
        # The first two axes of the encoded data are read as
        # (number of batches, batch size).
        self.N_batches, self.batch_size = encoded.shape[0], encoded.shape[1]

        # Normalise labels: the divisor doubles with every position in
        # `distances`, starting from the second-to-last axis of the encoded data.
        scale = encoded.shape[-2]
        self.normalisation_constant = scale
        self.distances = [
            level_distances / (scale * 2**level)
            for level, level_distances in enumerate(distances)
        ]
Ejemplo n.º 5
0
 def forward(self, sequence):
     """Embed a batch of index-encoded sequences with the MLP.

     Args:
         sequence: two-dimensional input; its first axis is used as the
             batch dimension when flattening.

     Returns:
         The output of ``self.mlp`` applied to the flattened encoding.
     """
     # Unpacking exactly two values keeps the requirement that the input is 2-D.
     batch_size, _ = sequence.shape
     one_hot = index_to_one_hot(sequence, device=self.device)
     # Collapse everything but the batch axis into one feature vector per row.
     flattened = one_hot.reshape(batch_size, -1)
     return self.mlp(flattened)
Ejemplo n.º 6
0
 def __init__(self, sequences):
     """Store the result of ``index_to_one_hot`` applied to ``sequences``.

     Args:
         sequences: index-encoded sequences to encode and keep.
     """
     encoded = index_to_one_hot(sequences)
     self.sequences = encoded
Ejemplo n.º 7
0
 def __init__(self, sequences, labels):
     """Store the encoded sequences alongside their labels.

     Args:
         sequences: index-encoded sequences; passed to ``index_to_one_hot``.
         labels: labels kept unchanged on the instance.
     """
     encoded = index_to_one_hot(sequences)
     self.sequences, self.labels = encoded, labels