Example #1
    def __init__(self, config):
        super(ClsProvider, self).__init__(config)
        # Setup encoding pipeline
        cleaner = text.clean_text()
        tokenizer = text.Tokenizer(sep=self.args["separator"])
        # word_tokenizer = text.Tokenizer(sep=' ')

        self.input_col_name = self.args["input_col_name"]
        self.label_col_name = self.args["label_col_name"]

        enc = text.Encoder(update_vocab=True, no_special_chars=False)
        label_enc = text.Encoder(update_vocab=True, no_special_chars=True)

        self.encoder = {}
        self.encoder["inp"] = [U.chainer(funcs=[cleaner, tokenizer, enc])]
        self.encoder["label"] = [U.chainer(funcs=[label_enc])]
        self.encoder["pred"] = [U.chainer(funcs=[label_enc])]
        self.encoder["mask"] = [U.chainer(funcs=[lambda x: x])]
        self.encoder["out"] = self.encoder["mask"]
        self.encoder["meta"] = self.encoder["mask"]

        self.decoder = {}
        self.decoder["inp"] = [
            U.chainer(funcs=[enc.decode, tokenizer.detokenize])
        ]
        self.decoder["label"] = [U.chainer(funcs=[label_enc.decode])]
        self.decoder["pred"] = [U.chainer(funcs=[label_enc.decode])]
        self.decoder["mask"] = [U.chainer(funcs=[lambda x: x])]
        self.decoder["out"] = self.decoder["mask"]
        self.decoder["meta"] = self.decoder["mask"]

        # Process data
        raw_data = self.upload_data()
        processed = [self.__call__(d, list_input=True) for d in raw_data]
        enc.freeze()
        # d = self._create_splits(processed)
        self.data_raw = raw_data
        self.data = tuple([self._to_batches(split) for split in processed])

        self.sample_data = raw_data[0][1]
        self.logger.info(f"Vocab size: {len(enc.vocab)}")
        self.logger.debug(f"Vocab: {enc.vocab}")
        self.logger.debug(
            f"Sample data: \n Raw: {raw_data[0][1]} \n Encoded: {self(raw_data[0][1])}"
        )
        config["processor"]["params"]["vocab_size"] = len(
            enc.vocab)  # Needs changing, we might have multiple vocabs
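
A minimal usage sketch of the pipeline above. Everything here is assumed rather than taken from the library: a fully populated config, the provider variable name, and the list_input flag whose behavior the test_chainer example below exercises.

provider = ClsProvider(config)

# Encode one raw string through clean -> tokenize -> encode,
# then invert it through decode -> detokenize.
encoded = provider.encoder["inp"][0]("some raw input text", list_input=False)
decoded = provider.decoder["inp"][0](encoded, list_input=False)
# decoded should match the cleaned form of the original string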
Example #2
    def __init__(self, config):
        super(MultiLMPairProvider, self).__init__(config)
        # Setup encoding pipeline
        cleaner = text.clean_text()
        char_tokenizer = text.Tokenizer(sep="")

        enc = text.Encoder(update_vocab=True, no_special_chars=False)
        sent_enc = U.chainer(funcs=[cleaner, char_tokenizer, enc])
        sent_dec = U.chainer(funcs=[enc.decode, char_tokenizer.detokenize])
        label_enc = text.Encoder(update_vocab=True, no_special_chars=True)
        as_is = U.chainer(funcs=[lambda x: x])

        self.encoder = {}
        self.encoder["inp"] = [sent_enc, sent_enc]
        self.encoder["label"] = [sent_enc, U.chainer(funcs=[label_enc])]
        self.encoder["pred"] = self.encoder["inp"]
        self.encoder["mask"] = [as_is, as_is]
        self.encoder["out"] = as_is
        self.encoder["meta"] = as_is

        self.decoder = {}
        self.decoder["inp"] = [sent_dec, sent_dec]
        self.decoder["label"] = [sent_dec, U.chainer(funcs=[label_enc.decode])]
        self.decoder["pred"] = [as_is, label_enc.decode]
        self.decoder["mask"] = [as_is, as_is]
        self.decoder["out"] = [as_is, as_is]
        self.decoder["meta"] = as_is

        # Process data
        raw_data = self.upload_data()
        processed = [self.__call__(d, list_input=True) for d in raw_data]
        enc.freeze()
        # d = self._create_splits(processed)
        self.data_raw = raw_data
        self.logger.info("Sample Raw Example:")
        self.logger.info(self.data_raw[0])

        self.data = tuple([self._to_batches(split) for split in processed])

        self.sample_data = raw_data[0][1]
        config["processor"]["params"]["vocab_size"] = len(
            enc.vocab)  # Needs changing, we might have multiple vocabs
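
The sep="" tokenizer above implies character-level splitting. A small sketch of the assumed round-trip behavior, not verified against the library:

char_tokenizer = text.Tokenizer(sep="")
tokens = char_tokenizer("abc")  # assumed to yield ["a", "b", "c"]
assert char_tokenizer.detokenize(tokens) == "abc"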
Example #3
def test_chainer():
    tokenizer = text.Tokenizer(sep=" ")
    enc = text.Encoder(update_vocab=True)
    chain = utils.chainer(
        funcs=[tokenizer, enc, enc.decode, tokenizer.detokenize])

    inp = "مرحبا هنا"
    assert inp == chain(inp, list_input=False)

    inp = ["hi_there man_", "how are you?"]
    assert inp == chain(inp, list_input=True)
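
The round-trip assertions above pin down what chainer has to do: compose its funcs left to right and, when list_input=True, map the whole chain over each element. A minimal sketch of that assumed behavior, not the library's actual implementation:

class chainer:
    """Compose funcs left to right; optionally map over a list of inputs."""

    def __init__(self, funcs):
        self.funcs = funcs

    def __call__(self, x, list_input=False):
        if list_input:
            return [self(item) for item in x]
        for func in self.funcs:
            x = func(x)
        return x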
Example #4
    def __init__(self, separator, max_vocab_size, min_vocab_count):
        cleaner = text.clean_text()
        tokenizer = text.Tokenizer(sep=separator)
        enc = text.Encoder(
            update_vocab=True,
            no_special_chars=False,
            max_vocab_size=max_vocab_size,
            min_vocab_count=min_vocab_count,
        )
        self.enc = enc
        self.encoder = U.chainer(funcs=[cleaner, tokenizer, enc])
        self.decoder = U.chainer(funcs=[enc.decode, tokenizer.detokenize])
        # num_classes is the vocabulary size (an int), not the vocab itself
        self.num_classes = len(enc.vocab)
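
A hypothetical round trip through this wrapper. The class name WordProcessor and the constructor values are illustrative, and clean_text is assumed to leave this particular input unchanged:

proc = WordProcessor(separator=" ", max_vocab_size=10000, min_vocab_count=1)
ids = proc.encoder("hello world", list_input=False)
assert proc.decoder(ids, list_input=False) == "hello world"
print(proc.num_classes)  # vocabulary size after encoding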
Example #5
def test_encoder_freezing():
    enc = text.Encoder()
    sen1 = "This is a test sent"
    sen2 = "This is a test sent!?"  # Has 2 unseen characters
    _ = enc(sen1)
    len_before = len(enc.get_vocab())
    print(len_before)
    enc.freeze()
    _ = enc(sen2)
    assert len_before == len(enc.get_vocab())
    enc.unfreeze()
    _ = enc(sen2)
    print(len(enc.get_vocab()))
    assert len_before == len(enc.get_vocab()) - 2
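
The test fixes the freeze semantics: while frozen, unseen symbols must not grow the vocabulary. A minimal sketch of an encoder with that behavior (assumed internals; the real text.Encoder may handle unseen symbols differently):

class FrozenVocabEncoder:
    def __init__(self):
        self.vocab = {"<UNK>": 0}
        self.frozen = False

    def freeze(self):
        self.frozen = True

    def unfreeze(self):
        self.frozen = False

    def get_vocab(self):
        return self.vocab

    def __call__(self, symbols):
        ids = []
        for sym in symbols:  # iterating a string yields characters
            if sym not in self.vocab:
                if self.frozen:
                    ids.append(self.vocab["<UNK>"])
                    continue
                self.vocab[sym] = len(self.vocab)
            ids.append(self.vocab[sym])
        return ids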
Example #6
    def __init__(self):
        class_enc = text.Encoder(update_vocab=True, no_special_chars=True)
        self.enc = class_enc
        self.encoder = U.chainer(funcs=[class_enc])
        self.decoder = U.chainer(funcs=[class_enc.decode])
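
A hypothetical use of this label wrapper. The class name ClassEncoder is assumed, as is the idea that the encoder maps each element of a list of label strings to an integer id:

labels = ClassEncoder()
ids = labels.encoder(["pos", "neg", "pos"], list_input=False)
assert labels.decoder(ids, list_input=False) == ["pos", "neg", "pos"]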