def __init__(self, config):
    super(ClsProvider, self).__init__(config)
    # Set up the encoding pipeline
    cleaner = text.clean_text()
    tokenizer = text.Tokenizer(sep=self.args["separator"])
    self.input_col_name = self.args["input_col_name"]
    self.label_col_name = self.args["label_col_name"]
    enc = text.Encoder(update_vocab=True, no_special_chars=False)
    label_enc = text.Encoder(update_vocab=True, no_special_chars=True)

    self.encoder = {}
    self.encoder["inp"] = [U.chainer(funcs=[cleaner, tokenizer, enc])]
    self.encoder["label"] = [U.chainer(funcs=[label_enc])]
    self.encoder["pred"] = [U.chainer(funcs=[label_enc])]
    self.encoder["mask"] = [U.chainer(funcs=[lambda x: x])]
    self.encoder["out"] = self.encoder["mask"]
    self.encoder["meta"] = self.encoder["mask"]

    self.decoder = {}
    self.decoder["inp"] = [U.chainer(funcs=[enc.decode, tokenizer.detokenize])]
    self.decoder["label"] = [U.chainer(funcs=[label_enc.decode])]
    self.decoder["pred"] = [U.chainer(funcs=[label_enc.decode])]
    self.decoder["mask"] = [U.chainer(funcs=[lambda x: x])]
    self.decoder["out"] = self.decoder["mask"]
    self.decoder["meta"] = self.decoder["mask"]

    # Process data
    raw_data = self.upload_data()
    processed = [self.__call__(d, list_input=True) for d in raw_data]
    enc.freeze()  # stop growing the vocab once the raw data has been encoded
    self.data_raw = raw_data
    self.data = tuple(self._to_batches(split) for split in processed)
    self.sample_data = raw_data[0][1]
    self.logger.info(f"Vocab size: {len(enc.vocab)}")
    self.logger.debug(f"Vocab: {enc.vocab}")
    self.logger.debug(
        f"Sample data: \n Raw: {raw_data[0][1]} \n Encoded: {self(raw_data[0][1])}"
    )
    # TODO: needs changing; we might have multiple vocabs
    config["processor"]["params"]["vocab_size"] = len(enc.vocab)
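# Hedged usage sketch (not part of the provider): how the "inp" and "label"
# chainers above encode one example and decode it back. The provider argument,
# sample text, and label value are hypothetical; only the chainer call
# convention (see test_chainer below) comes from the source.
def _cls_roundtrip_demo(provider, sample=("this movie was great", "pos")):
    text_ids = provider.encoder["inp"][0](sample[0], list_input=False)
    label_id = provider.encoder["label"][0](sample[1], list_input=False)
    # label_enc should invert exactly; the text side may differ after clean_text
    assert provider.decoder["label"][0](label_id, list_input=False) == sample[1]
    return text_ids, label_id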
def __init__(self, config):
    super(MultiLMPairProvider, self).__init__(config)
    # Set up the encoding pipeline
    cleaner = text.clean_text()
    char_tokenizer = text.Tokenizer(sep="")
    enc = text.Encoder(update_vocab=True, no_special_chars=False)
    sent_enc = U.chainer(funcs=[cleaner, char_tokenizer, enc])
    sent_dec = U.chainer(funcs=[enc.decode, char_tokenizer.detokenize])
    label_enc = text.Encoder(update_vocab=True, no_special_chars=True)
    as_is = U.chainer(funcs=[lambda x: x])

    self.encoder = {}
    self.encoder["inp"] = [sent_enc, sent_enc]
    self.encoder["label"] = [sent_enc, U.chainer(funcs=[label_enc])]
    self.encoder["pred"] = self.encoder["inp"]
    self.encoder["mask"] = [as_is, as_is]
    self.encoder["out"] = as_is
    self.encoder["meta"] = as_is

    self.decoder = {}
    self.decoder["inp"] = [sent_dec, sent_dec]
    self.decoder["label"] = [sent_dec, U.chainer(funcs=[label_enc.decode])]
    self.decoder["pred"] = [as_is, label_enc.decode]
    self.decoder["mask"] = [as_is, as_is]
    self.decoder["out"] = [as_is, as_is]
    self.decoder["meta"] = as_is

    # Process data
    raw_data = self.upload_data()
    processed = [self.__call__(d, list_input=True) for d in raw_data]
    enc.freeze()  # stop growing the vocab once the raw data has been encoded
    self.data_raw = raw_data
    self.logger.info("Sample Raw Example:")
    self.logger.info(self.data_raw[0])
    self.data = tuple(self._to_batches(split) for split in processed)
    self.sample_data = raw_data[0][1]
    # TODO: needs changing; we might have multiple vocabs
    config["processor"]["params"]["vocab_size"] = len(enc.vocab)
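# Hedged sketch: each key above holds two chainers, one per field of a paired
# example. The example values and the (inp_pair, label_pair) shape are
# assumptions for illustration; only the two-chainer list structure comes from
# the source.
def _pair_encode_demo(provider,
                      example=(("hi there", "HI THERE"), ("hi there", "copy"))):
    inp_pair, label_pair = example
    inp_ids = [chain(field, list_input=False)
               for chain, field in zip(provider.encoder["inp"], inp_pair)]
    label_ids = [chain(field, list_input=False)
                 for chain, field in zip(provider.encoder["label"], label_pair)]
    return inp_ids, label_ids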
def test_chainer():
    tokenizer = text.Tokenizer(sep=" ")
    enc = text.Encoder(update_vocab=True)
    chain = utils.chainer(funcs=[tokenizer, enc, enc.decode, tokenizer.detokenize])
    inp = "مرحبا هنا"
    assert inp == chain(inp, list_input=False)
    inp = ["hi_there man_", "how are you?"]
    assert inp == chain(inp, list_input=True)
def __init__(self, seperator, max_vocab_size, min_vocab_count):
    cleaner = text.clean_text()
    tokenizer = text.Tokenizer(sep=seperator)
    enc = text.Encoder(
        update_vocab=True,
        no_special_chars=False,
        max_vocab_size=max_vocab_size,
        min_vocab_count=min_vocab_count,
    )
    self.enc = enc
    self.encoder = U.chainer(funcs=[cleaner, tokenizer, enc])
    self.decoder = U.chainer(funcs=[enc.decode, tokenizer.detokenize])
    self.num_classes = len(enc.vocab)  # class count is the vocab size, not the vocab itself
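# Hedged usage sketch: TokenizerPipeline is a hypothetical name for the class
# whose __init__ appears above, and the sample sentence is illustrative. It
# assumes clean_text leaves plain lowercase ASCII unchanged, so the
# encode/decode round trip is exact.
def _tokenizer_pipeline_demo():
    pipe = TokenizerPipeline(seperator=" ", max_vocab_size=10000, min_vocab_count=1)
    ids = pipe.encoder("hello brave new world", list_input=False)
    pipe.enc.freeze()  # lock the vocab before touching unseen data
    assert pipe.decoder(ids, list_input=False) == "hello brave new world"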
def test_encoder_freezing():
    enc = text.Encoder()
    sen1 = "This is a test sent"
    sen2 = "This is a test sent!?"  # has 2 characters unseen in sen1: '!' and '?'
    _ = enc(sen1)
    len_before = len(enc.get_vocab())
    enc.freeze()
    _ = enc(sen2)
    assert len_before == len(enc.get_vocab())  # frozen: vocab must not grow
    enc.unfreeze()
    _ = enc(sen2)
    assert len_before == len(enc.get_vocab()) - 2  # unfrozen: '!' and '?' were added
def __init__(self):
    class_enc = text.Encoder(update_vocab=True, no_special_chars=True)
    self.enc = class_enc
    self.encoder = U.chainer(funcs=[class_enc])
    self.decoder = U.chainer(funcs=[class_enc.decode])
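# Hedged usage sketch: LabelCoder is a hypothetical name for the wrapper whose
# __init__ appears above. Labels should round-trip through encode/decode, and
# the vocab can be frozen once every training label has been seen (freeze and
# unfreeze behaviour is exercised in test_encoder_freezing above).
def _label_coder_demo():
    labels = LabelCoder()
    ids = labels.encoder(["pos", "neg", "pos"], list_input=True)
    labels.enc.freeze()  # no new labels expected past this point
    assert labels.decoder(ids, list_input=True) == ["pos", "neg", "pos"]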