def load_col_chars(
    char_enc,
    col_chars_path,
    blank_repr_div=4,
    unknown_char_extra_neg=False,
):
    """Load per-character representations stored in an HDF5 file.

    Each dataset name in the file is mapped to a character class by taking
    the text after its last underscore and looking it up in
    ``char_enc.encoder``. One tensor is loaded per dataset, in the same
    order as the dataset names.

    Args:
        char_enc: Character encoder exposing an ``encoder`` mapping and an
            ``unknown_idx`` attribute.
        col_chars_path: Path of the HDF5 file to read.
        blank_repr_div: When not None, only the first
            ``int(len / blank_repr_div)`` rows of the blank (``'~'``)
            representation are kept.
        unknown_char_extra_neg: When True and ``char_enc.unknown_idx`` is in
            the nominal encoder, the ``'#'`` representation is returned as
            extra negatives and its class is popped from the encoder.

    Returns:
        Tuple ``(nominal_enc, labels_repr, extra_negatives)``;
        ``extra_negatives`` is None when the unknown character is not used
        as extra negatives.
    """
    with h5py.File(col_chars_path, 'r') as hf5:
        dataset_names = list(hf5.keys())
        char_classes = [
            char_enc.encoder[name.rpartition('_')[-1]]
            for name in dataset_names
        ]
        nominal_enc = NominalDataEncoder(char_classes)
        labels_repr = [torch.tensor(hf5[name][:]) for name in dataset_names]

    # Shrink the blank character's representation to a leading fraction.
    blank_pos = nominal_enc.encoder[char_enc.encoder['~']]
    if blank_repr_div is not None:
        blank_repr = labels_repr[blank_pos]
        keep_count = int(len(blank_repr) / blank_repr_div)
        labels_repr[blank_pos] = blank_repr[:keep_count]

    # Without the unknown character as extra negatives there is nothing
    # further to adjust.
    if not (char_enc.unknown_idx in nominal_enc.encoder
            and unknown_char_extra_neg):
        return nominal_enc, labels_repr, None

    unknown_pos = nominal_enc.encoder[char_enc.encoder['#']]
    extra_negatives = labels_repr[unknown_pos]

    # The unknown char is treated like the rest of the unknowns, so remove it
    # from the encoder so the MEVM does not treat it as a known class.
    # NOTE(review): the unknown character's tensor remains in `labels_repr`
    # after this pop -- confirm callers expect that.
    nominal_enc.pop(unknown_pos)
    return nominal_enc, labels_repr, extra_negatives
def organize_data_pts_by_logits(argmax_logits, layers):
    """Group layer encodings into one tensor per predicted label.

    Args:
        argmax_logits: Array of predicted labels, aligned with the first axis
            of ``layers``.
        layers: Array of layer encodings, indexable by an integer index
            array.

    Returns:
        Tuple ``(labels_repr, nominal_encoder)``: a list with one
        ``torch.Tensor`` per unique label (in ``np.unique`` order) and the
        ``NominalDataEncoder`` built from those unique labels.
    """
    unique_labels, label_counts = np.unique(
        argmax_logits,
        return_counts=True,
    )
    logging.debug('Number of Unique Labels: %d', len(unique_labels))
    logging.debug('The unique labels: %s', unique_labels)
    logging.debug('Label counts: %s', label_counts)

    # Lets callers recover the label from the MEVM's class indexing.
    nominal_encoder = NominalDataEncoder(unique_labels)

    labels_repr = []
    logging.info('Unique Labels contained within layer encoding:')
    for label, count in zip(unique_labels, label_counts):
        logging.info('%d : %d', label, count)

        member_rows = np.where(argmax_logits == label)[0]
        class_points = torch.tensor(layers[member_rows])
        labels_repr.append(class_points)

        logging.debug('Label `%s`\'s indices = %s', label, member_rows)
        logging.debug(
            'Torch tensor shape of label %s = %s',
            label,
            class_points.shape,
        )

    return labels_repr, nominal_encoder
def load(filepath, blank_idx, space_char, unknown_idx):
    """Load the label set from ``filepath`` and create the char encoder.

    Args:
        filepath: Location handed to ``NominalDataEncoder.load``.
        blank_idx: Blank argument forwarded to ``CharEncoder``.
        space_char: Space character forwarded to ``CharEncoder``.
        unknown_idx: Unknown-character argument forwarded to ``CharEncoder``.

    Returns:
        A ``CharEncoder`` built from the keys of the loaded encoder's
        ``encoder`` mapping, in iteration order.
    """
    nde = NominalDataEncoder.load(filepath)

    # TODO build CharEncoder s.t. it can simply copy the parts of the given
    # NDE
    # The blank argument is the `blank_idx` parameter; the previous code
    # referenced an undefined name `blank`.
    return CharEncoder(blank_idx, space_char, unknown_idx, list(nde.encoder))
def __init__(self, labels=None, max_unknown=None, *args, **kwargs):
    """Initialize the MEVM together with its label encoder.

    Args:
        labels: None, an existing ``NominalDataEncoder`` (stored as is), or
            a list / ``np.ndarray`` of labels from which a new
            ``NominalDataEncoder`` is built.
        max_unknown: Stored unchanged as ``self.max_unknown``.
        *args: Forwarded to the parent class initializer.
        **kwargs: Forwarded to the parent class initializer.

    Raises:
        TypeError: If ``labels`` is of any other type.
    """
    super(MEVM, self).__init__(*args, **kwargs)

    # The NominalDataEncoder maps class inputs to the MEVM's internal class
    # representation.
    if labels is None or isinstance(labels, NominalDataEncoder):
        self.label_enc = labels
    elif isinstance(labels, (list, np.ndarray)):
        self.label_enc = NominalDataEncoder(labels)
    else:
        # f-string so the offending type is actually interpolated.
        raise TypeError(' '.join([
            'Expected `labels` of types: None, list, np.ndarray, or',
            f'NominalDataEncoder, not of type {type(labels)}',
        ]))

    self.max_unknown = max_unknown
def __init__(self, *args, **kwargs):
    """Create the label encoder from the given arguments.

    All positional and keyword arguments are forwarded unchanged to
    ``NominalDataEncoder``; the result is stored as ``self.label_enc``.
    """
    encoder = NominalDataEncoder(*args, **kwargs)
    self.label_enc = encoder
def load(h5, labels=None, labels_dtype=None, train_hyperparams=None):
    """Restore an MEVM and its label encoder from an HDF5 state.

    Performs the same load functionality as in MultipleEVM but loads the
    ordered labels from the h5 file for the label encoder.

    Args:
        h5: An open HDF5 object, or a ``str`` path that is opened read-only.
        labels: Optional labels for the encoder. When given, they are used
            even if the HDF5 state contains a ``labels`` dataset.
        labels_dtype: dtype the stored labels are cast to; when None it is
            read from ``h5.attrs['labels_dtype']``.
        train_hyperparams: None (use the default list of attribute names), a
            list of attribute names to read from ``h5.attrs``, or a dict
            passed to ``MEVM`` directly.

    Returns:
        The constructed ``MEVM`` with its ``_evms`` set to the loaded EVMs.

    Raises:
        TypeError: If ``train_hyperparams`` is not None, a list, or a dict.
    """
    # NOTE(review): a file opened here from a path is never closed in this
    # function -- confirm whether the loaded EVMs need it to stay open.
    if isinstance(h5, str):
        h5 = h5py.File(h5, 'r')

    # EVMs are stored under consecutive keys "EVM-1", "EVM-2", ...
    _evms = []
    index = 1
    evm_key = "EVM-%d" % index
    while evm_key in h5:
        _evms.append(EVM(h5[evm_key], log_level='debug'))
        index += 1
        evm_key = "EVM-%d" % index

    # Choose the source of the ordered labels for the NominalDataEncoder.
    has_stored_labels = 'labels' in h5.keys()
    if labels is not None:
        if has_stored_labels:
            logging.info(' '.join([
                '`labels` key exists in the HDF5 MEVM state file, but',
                'labels was given explicitly to MEVM.load(). Ignoring the',
                'labels in the HDF5 file.',
            ]))
        label_enc = NominalDataEncoder(labels)
    elif has_stored_labels:
        if labels_dtype is None:
            labels_dtype = np.dtype(h5.attrs['labels_dtype'])
        stored_labels = h5['labels'][:].astype(labels_dtype)
        label_enc = NominalDataEncoder(stored_labels)
    else:
        logging.warning(' '.join([
            'No `labels` dataset available in given hdf5. Relying on the',
            'evm model\'s labels if they exist. Will fail if the MEVM',
            'state does not have any labels in each of its EVM.',
        ]))
        label_enc = NominalDataEncoder([evm.label for evm in _evms])

    # Load training vars if not given.
    if train_hyperparams is None:
        # NOTE Able to specify which to load from h5 by passing a list.
        train_hyperparams = [
            'tailsize',
            'cover_threshold',
            'distance_function',
            'distance_multiplier',
            'max_unknown',
        ]

    if isinstance(train_hyperparams, list):
        # Only the requested attributes that are present in the file.
        found = {}
        for attr in train_hyperparams:
            if attr in h5.attrs:
                found[attr] = h5.attrs[attr]
        train_hyperparams = found
    elif not isinstance(train_hyperparams, dict):
        raise TypeError(' '.join([
            '`train_hyperparams` expected type: None, list, or dict, but',
            f'recieved {type(train_hyperparams)}',
        ]))

    mevm = MEVM(label_enc, **train_hyperparams)
    mevm._evms = _evms
    return mevm
def fit(self, points, labels=None, extra_negatives=None):
    """Wrap ``train()``, normalizing inputs and updating the label encoder.

    Args:
        points: Either a list of per-class ``np.ndarray`` /
            ``torch.Tensor`` objects, or a single ``np.ndarray`` aligned
            with ``labels`` (an ``(X, y)`` pair), which is split into one
            tensor per unique label.
        labels: None, a list / ``np.ndarray`` of labels, or a
            ``NominalDataEncoder``. When not None, its length must equal the
            number of classes in ``points`` and it replaces
            ``self.label_enc``.
        extra_negatives: None, a ``torch.Tensor``, a 2-dimensional
            ``np.ndarray``, or a list; the latter two are converted with
            ``torch.Tensor``.

    Raises:
        TypeError: If ``points``, ``labels`` or ``extra_negatives`` has an
            unsupported type.
        ValueError: If the number of labels differs from the number of
            classes in ``points``.
    """
    points_type_msg = ' '.join([
        'expected points to be of types: list(np.ndarray),',
        'list(torch.tensor), or np.ndarray with labels as an',
        'aligned list or np.ndarray',
    ])

    if (
        isinstance(points, np.ndarray)
        and isinstance(labels, (list, np.ndarray))
        and len(points) == len(labels)
    ):
        # Aligned sequence pair (X, y): split into one tensor per unique
        # label and use the unique labels as the class labels.
        unique = np.unique(labels)
        labels = np.array(labels)
        points = [torch.Tensor(points[labels == u]) for u in unique]
        labels = unique
    elif isinstance(points, list):
        if all(isinstance(pts, np.ndarray) for pts in points):
            # A list of np.ndarrays is turned into torch.Tensors.
            points = [torch.Tensor(pts) for pts in points]
        elif not all(isinstance(pts, torch.Tensor) for pts in points):
            raise TypeError(points_type_msg)
    else:
        raise TypeError(points_type_msg)

    # Set the encoder when labels are given.
    if labels is not None:
        if len(points) != len(labels):
            raise ValueError(' '.join([
                'The given number of labels does not equal the number of',
                'classes represented by the list of points.',
                'If giving an aligned sequence pair of points and labels,',
                'then ensure `points` is of type `np.ndarray`.',
            ]))

        if self.label_enc is not None:
            logging.debug(
                '`encoder` is not None and is being overwritten!',
            )

        if isinstance(labels, NominalDataEncoder):
            self.label_enc = labels
        elif isinstance(labels, (list, np.ndarray)):
            self.label_enc = NominalDataEncoder(labels)
        else:
            # f-string so the offending type is actually interpolated.
            raise TypeError(' '.join([
                'Expected `labels` of types: None, list, np.ndarray, or',
                f'NominalDataEncoder, not of type {type(labels)}',
            ]))

    # Ensure extra_negatives is of the expected form (no labels for these).
    # None is the default and means "no extra negatives", so it is passed
    # through untouched instead of being rejected.
    if extra_negatives is not None:
        if (
            (isinstance(extra_negatives, np.ndarray)
             and len(extra_negatives.shape) == 2)
            or isinstance(extra_negatives, list)
        ):
            extra_negatives = torch.Tensor(extra_negatives)
        elif not isinstance(extra_negatives, torch.Tensor):
            raise TypeError(' '.join([
                'The extra_negatives must be either None, torch.Tensor of',
                'shape 2, or an object broadcastable to such a torch.Tensor.',
            ]))

    # Points is now list(torch.Tensors) and encoder handled.
    # TODO handle adjust of extra negatives as a list of labels to be known
    # unknowns. For now, expects extra_negatives always of correct type.
    self.train(points, labels, extra_negatives)