def encoder(self, data):
    """Build and store the encoder mapping on ``self._encoder``.

    Arguments:
        data: Either a list, in which case every element is mapped to its
            position in the list, or a dict (``OrderedDict`` included,
            since it subclasses ``dict``), which is wrapped as given.

    Raises:
        TypeError: If ``data`` is neither a list nor a dict. This is an
            explicit raise rather than ``assert False`` so the check is
            not stripped when Python runs with ``-O``.
    """
    if isinstance(data, list):
        # enumerate() yields (index, item); taking .inv flips the mapping
        # so that lookups go from item to index.
        self._encoder = OneToOne(enumerate(data)).inv
    elif isinstance(data, dict):
        self._encoder = OneToOne(data)
    else:
        raise TypeError(
            f"encoder data must be a list or a dict, got {type(data).__name__}"
        )
def __init__(self, labels=None, reserved_mappings=None, filepath=None):
    '''
    Initialise an empty encoder and merge in any initial labels.

    Arguments:
        labels=None, list(strings):
            A list of potentially non-unique strings representing
            categorical labels. None (the default) or an empty
            collection means "no labels".
        reserved_mappings=None, dict({str:int}):
            A dictionary mapping of text to integer numbers. NOTE: only
            the keys are passed on to merge_labels(); the integer values
            supplied here are not used by this constructor.
        filepath=None:
            If given, labels are loaded with self.load_labels(filepath)
            and merged last.

    Merge order is: reserved_mappings, then labels, then the file.
    '''
    self.num_classes = 0
    self._encodings = OneToOne()

    # None sentinels replace the former mutable defaults ([] and {}), so
    # no default object is shared between calls.
    if reserved_mappings is not None and len(reserved_mappings) > 0:
        # list() over a dict keeps only its keys.
        self.merge_labels(list(reserved_mappings))
    if labels is not None and len(labels) > 0:
        self.merge_labels(labels)
    if filepath is not None:
        self.merge_labels(self.load_labels(filepath))
class ParameterKeyEscaper:
    """
    Makes the fields name ready for use with MongoDB and Mongoengine.

    - "." and "$" are replaced with their codes ("%2E" and "%24")
    - "__" and a leading "_" are escaped ("%_%_" and "%_")
    - Since "%" is used as the escape character, "%" itself is also
      escaped (doubled to "%%")
    """

    # Two-way table of plain text <-> escaped form; .inv is used to unescape.
    _mapping = OneToOne({".": "%2E", "$": "%24", "__": "%_%_"})

    @classmethod
    def escape(cls, value):
        """
        Quote a parameter key.

        The value is stripped of surrounding whitespace before quoting.
        Raises errors.bad_request.ValidationError when value is None.
        """
        if value is None:
            raise errors.bad_request.ValidationError("Key cannot be empty")
        # Double every literal "%" first so that the codes inserted below
        # are the only single "%" characters in the result.
        value = value.strip().replace("%", "%%")
        for c, r in cls._mapping.items():
            value = value.replace(c, r)
        if value.startswith("_"):
            value = "%_" + value[1:]
        return value

    @classmethod
    def _unescape(cls, value):
        """ Replace every escape code in value with its plain text """
        for c, r in cls._mapping.inv.items():
            value = value.replace(c, r)
        return value

    @classmethod
    def unescape(cls, value):
        """ Unquote a quoted parameter key """
        # Splitting on "%%" isolates the literal percent signs, so any
        # single "%" left inside a segment starts an escape code.
        parts = [cls._unescape(part) for part in value.split("%%")]
        # The leading-underscore marker must be looked for in the first
        # segment only. Checking the re-joined string instead would mistake
        # a literal "%" followed by "_" (escaped as "%%_") for the marker
        # and drop the "%".
        if parts[0].startswith("%_"):
            parts[0] = "_" + parts[0][2:]
        return "%".join(parts)
class ParameterKeyEscaper:
    """
    Quotes and unquotes parameter keys.

    "." and "$" are swapped for the codes "%2E" and "%24"; a literal "%"
    is doubled to "%%" because "%" introduces the codes.
    """

    # Two-way table of plain character <-> code; .inv is used to unquote.
    _mapping = OneToOne({".": "%2E", "$": "%24"})

    @classmethod
    def escape(cls, value):
        """ Quote a parameter key (surrounding whitespace is stripped) """
        quoted = value.strip().replace("%", "%%")
        for plain, code in cls._mapping.items():
            quoted = quoted.replace(plain, code)
        return quoted

    @classmethod
    def _unescape(cls, value):
        """ Turn every code found in value back into its plain character """
        restored = value
        for code, plain in cls._mapping.inv.items():
            restored = restored.replace(code, plain)
        return restored

    @classmethod
    def unescape(cls, value):
        """ Unquote a quoted parameter key """
        # Segments between "%%" pairs contain only codes, never literal "%".
        segments = value.split("%%")
        return "%".join(cls._unescape(segment) for segment in segments)
def test_one_to_one():
    """Exercise OneToOne: forward/inverse sync, overlaps and unique()."""
    forward = OneToOne({1: 2})

    def expect(contents, inverse_contents):
        # Both directions are compared together after every mutation.
        assert (forward, forward.inv) == (contents, inverse_contents)

    expect({1: 2}, {2: 1})

    # --- item assignment and clearing ---
    forward[2] = 3
    expect({1: 2, 2: 3}, {3: 2, 2: 1})
    forward.clear()
    expect({}, {})
    forward[1] = 1
    expect({1: 1}, {1: 1})
    forward[1] = 2
    expect({1: 2}, {2: 1})
    # Re-using an existing value evicts the key that held it.
    forward[3] = 2
    expect({3: 2}, {2: 3})
    del forward[3]
    expect({}, {})

    # --- mutation through the inverse view ---
    forward[1] = 2
    forward.inv[2] = 3
    expect({3: 2}, {2: 3})
    del forward.inv[2]
    expect({}, {})

    # --- copy keeps the inverse consistent ---
    assert OneToOne({1: 2, 3: 4}).copy().inv == {2: 1, 4: 3}

    # --- pop / popitem from either side ---
    forward[1] = 2
    forward.pop(1)
    expect({}, {})
    forward[1] = 2
    forward.inv.pop(2)
    expect({}, {})
    forward[1] = 2
    forward.popitem()
    expect({}, {})

    # --- setdefault from either side ---
    forward.setdefault(1)
    expect({1: None}, {None: 1})
    forward.inv.setdefault(2)
    expect({1: None, None: 2}, {None: 1, 2: None})

    # --- update with a mapping plus keyword arguments ---
    forward.clear()
    forward.update({1: 2}, cat="dog")
    expect({1: 2, "cat": "dog"}, {2: 1, "dog": "cat"})

    # --- overlapping values collapse to a single pair ---
    overlapping = OneToOne({'a': 0, 'b': 0})
    assert len(overlapping) == len(overlapping.inv) == 1
    overlapping['c'] = 0
    assert len(overlapping) == len(overlapping.inv) == 1
    assert overlapping.inv[0] == 'c'
    overlapping.update({'z': 0, 'y': 0})
    assert len(overlapping) == len(overlapping.inv) == 1

    # --- the unique() classmethod rejects duplicate values ---
    with pytest.raises(ValueError):
        OneToOne.unique({'a': 0, 'b': 0})
def test_one_to_one():
    """Check that OneToOne keeps its forward and inverse views in sync."""
    forward = OneToOne({1: 2})

    def expect(contents, inverse_contents):
        # Both directions are compared together after every mutation.
        assert (forward, forward.inv) == (contents, inverse_contents)

    expect({1: 2}, {2: 1})

    # --- item assignment and clearing ---
    forward[2] = 3
    expect({1: 2, 2: 3}, {3: 2, 2: 1})
    forward.clear()
    expect({}, {})
    forward[1] = 1
    expect({1: 1}, {1: 1})
    forward[1] = 2
    expect({1: 2}, {2: 1})
    # Re-using an existing value evicts the key that held it.
    forward[3] = 2
    expect({3: 2}, {2: 3})
    del forward[3]
    expect({}, {})

    # --- mutation through the inverse view ---
    forward[1] = 2
    forward.inv[2] = 3
    expect({3: 2}, {2: 3})
    del forward.inv[2]
    expect({}, {})

    # --- copy keeps the inverse consistent ---
    assert OneToOne({1: 2, 3: 4}).copy().inv == {2: 1, 4: 3}

    # --- pop / popitem from either side ---
    forward[1] = 2
    forward.pop(1)
    expect({}, {})
    forward[1] = 2
    forward.inv.pop(2)
    expect({}, {})
    forward[1] = 2
    forward.popitem()
    expect({}, {})

    # --- setdefault from either side ---
    forward.setdefault(1)
    expect({1: None}, {None: 1})
    forward.inv.setdefault(2)
    expect({1: None, None: 2}, {None: 1, 2: None})

    # --- update with a mapping plus keyword arguments ---
    forward.clear()
    forward.update({1: 2}, cat="dog")
    expect({1: 2, "cat": "dog"}, {2: 1, "dog": "cat"})
def __init__(self, labels):
    """Derive the class list and the class -> index encoder from labels.

    Arguments:
        labels: iterable of orderable label values; duplicates are allowed.
    """
    ordered_labels = sorted(labels)
    distinct_labels = np.unique(ordered_labels)
    self.classes = tuple(distinct_labels)

    # enumerate() pairs each class with its position; .inv flips that
    # index -> class mapping into class -> index.
    index_to_class = OneToOne(enumerate(self.classes))
    self._encoder = index_to_class.inv

    # File name attribute; where it is used is not visible in this block.
    self.fname = 'label_encoder.json'
class LabelEncoder:
    '''
    Two-way mapping between labels and consecutive integer encodings.

    New labels receive the next free integer (self.num_classes) in the
    sorted order produced by np.unique; labels already known keep their
    existing encoding.
    '''

    def __init__(self, labels=None, reserved_mappings=None, filepath=None):
        '''
        Arguments:
            labels=None, list(strings):
                A list of potentially non-unique strings representing
                categorical labels. None (the default) or an empty
                collection means "no labels".
            reserved_mappings=None, dict({str:int}):
                A dictionary mapping of text to integer numbers. NOTE:
                only the keys are merged; the integer values supplied
                here are not used.
            filepath=None:
                If given, labels are loaded with self.load_labels(filepath)
                and merged last.

        Merge order is: reserved_mappings, then labels, then the file.
        '''
        self.num_classes = 0
        self._encodings = OneToOne()

        # None sentinels replace the former mutable defaults ([] and {}).
        if reserved_mappings is not None and len(reserved_mappings) > 0:
            # list() over a dict keeps only its keys.
            self.merge_labels(list(reserved_mappings))
        if labels is not None and len(labels) > 0:
            self.merge_labels(labels)
        if filepath is not None:
            self.merge_labels(self.load_labels(filepath))

    def filter(self, data_df, text_label_col='family', int_label_col=None):
        '''
        Filter a dataframe to include only rows corresponding to labels in
        the encoder. Useful for preprocessing a target domain dataset for a
        model trained on source domain labels.

        When int_label_col is given (truthy), rows are matched on that
        column against the known integer encodings; otherwise rows are
        matched on text_label_col against the known labels.
        '''
        # Only the whitelist that is actually needed is built, and it is
        # read straight from the live mapping (no deep copy required).
        if int_label_col:
            column = int_label_col
            whitelist = list(self._encodings.inv)
        else:
            column = text_label_col
            whitelist = list(self._encodings)
        return data_df[data_df[column].isin(whitelist)]

    def transform(self, labels):
        ''' Return the integer encoding of every label (KeyError if unknown). '''
        return [self._encodings[label] for label in labels]

    def inv_transform(self, encoded_labels):
        ''' Return the label for every integer encoding (KeyError if unknown). '''
        return [self._encodings.inv[code] for code in encoded_labels]

    def merge_labels(self, labels=None):
        '''
        Labels can be list, or a dict where the keys are str.
        Iterates through the unique values of labels and registers those
        that don't already exist in the encoder, giving each the next
        free integer.
        '''
        labels = [] if labels is None else list(labels)
        for label in np.unique(labels):
            if label not in self._encodings:
                self._encodings[label] = self.num_classes
                self.num_classes += 1

    def load_labels(self, filepath):
        ''' Load encodings from filepath via load_label_encodings_from_file. '''
        return load_label_encodings_from_file(filepath)

    def save_labels(self, filepath):
        ''' Write a copy of the encodings to filepath via save_label_encodings_to_file. '''
        save_label_encodings_to_file(self.get_encodings(), filepath)

    def get_encodings(self):
        ''' Return a deep copy of the encodings so callers cannot mutate the encoder. '''
        return copy.deepcopy(self._encodings)

    def __len__(self):
        # Reading the size does not need a defensive copy.
        return len(self._encodings)

    def __repr__(self):
        return json.dumps(self._encodings, indent=2)