def __init__(self, seperator, max_vocab_size, min_vocab_count):
    """Assemble a clean -> tokenize -> encode text pipeline.

    NOTE(review): the parameter is spelled `seperator` (sic); it is kept
    as-is because callers may pass it by keyword.

    Args:
        seperator: token separator passed to the tokenizer.
        max_vocab_size: cap on vocabulary size for the encoder.
        min_vocab_count: minimum occurrence count for a vocab entry.
    """
    # Encoder grows its vocab while encoding (update_vocab=True) and keeps
    # special characters.
    encoder = text.Encoder(
        no_special_chars=False,
        update_vocab=True,
        max_vocab_size=max_vocab_size,
        min_vocab_count=min_vocab_count,
    )
    clean_fn = text.clean_text()
    tokenize_fn = text.Tokenizer(sep=seperator)

    self.enc = encoder
    self.encoder = U.chainer(funcs=[clean_fn, tokenize_fn, encoder])
    self.decoder = U.chainer(funcs=[encoder.decode, tokenize_fn.detokenize])
    # NOTE(review): stores the vocab object itself, not its length — confirm
    # downstream consumers expect this rather than len(vocab).
    self.num_classes = encoder.vocab
# --- Example #2 (score: 0) ---
    def __init__(self, config):
        """Build char-level encode/decode pipelines for paired-text LM data.

        Populates `self.encoder` / `self.decoder` dicts keyed by field name
        ("inp", "label", "pred", "mask", "out", "meta"), encodes the uploaded
        raw data, and stores batched splits in `self.data`.

        Args:
            config: nested config dict; the discovered vocab size is written
                back into config["processor"]["params"]["vocab_size"].
        """
        super(MultiLMPairProvider, self).__init__(config)
        # Setup encoding pipeline
        cleaner = text.clean_text()
        # sep="" -> character-level tokenization.
        char_tokenizer = text.Tokenizer(sep="")

        # Shared text vocab; grows during the encoding pass (update_vocab=True)
        # until frozen below.
        enc = text.Encoder(update_vocab=True, no_special_chars=False)
        sent_enc = U.chainer(funcs=[cleaner, char_tokenizer, enc])
        sent_dec = U.chainer(funcs=[enc.decode, char_tokenizer.detokenize])
        # Separate vocab for labels; labels carry no special tokens.
        label_enc = text.Encoder(update_vocab=True, no_special_chars=True)
        # Identity pipeline for fields passed through untouched.
        as_is = U.chainer(funcs=[lambda x: x])

        self.encoder = {}
        # Two-element lists: one pipeline per element of the input pair.
        self.encoder["inp"] = [sent_enc, sent_enc]
        self.encoder["label"] = [sent_enc, U.chainer(funcs=[label_enc])]
        self.encoder["pred"] = self.encoder["inp"]
        self.encoder["mask"] = [as_is, as_is]
        self.encoder["out"] = as_is
        self.encoder["meta"] = as_is

        self.decoder = {}
        self.decoder["inp"] = [sent_dec, sent_dec]
        self.decoder["label"] = [sent_dec, U.chainer(funcs=[label_enc.decode])]
        # NOTE(review): bare `label_enc.decode` here (not wrapped in a chainer),
        # unlike the other entries — confirm this asymmetry is intentional.
        self.decoder["pred"] = [as_is, label_enc.decode]
        self.decoder["mask"] = [as_is, as_is]
        self.decoder["out"] = [as_is, as_is]
        self.decoder["meta"] = as_is

        # Process data
        raw_data = self.upload_data()
        # This encoding pass also populates the vocab (update_vocab=True).
        s = [self.__call__(d, list_input=True) for d in raw_data]
        enc.freeze()
        # d = self._create_splits(s)
        self.data_raw = raw_data
        self.logger.info("Sample Raw Example:")
        self.logger.info(self.data_raw[0])

        self.data = tuple([self._to_batches(split) for split in s])

        self.sample_data = raw_data[0][1]
        config["processor"]["params"]["vocab_size"] = len(
            enc.vocab)  # Needs changing, we might have multiple vocabs
# --- Example #3 (score: 0) ---
def test_chainer():
    """Tokenize -> encode -> decode -> detokenize must be the identity."""
    tok = text.Tokenizer(sep=" ")
    codec = text.Encoder(update_vocab=True)
    roundtrip = utils.chainer(
        funcs=[tok, codec, codec.decode, tok.detokenize])

    # Single string input.
    single = "مرحبا هنا"
    assert roundtrip(single, list_input=False) == single

    # Batched (list) input.
    batch = ["hi_there man_", "how are you?"]
    assert roundtrip(batch, list_input=True) == batch
# --- Example #4 (score: 0) ---
    def __init__(self, config):
        """Build encode/decode pipelines for a single-input classification task.

        Populates `self.encoder` / `self.decoder` dicts keyed by field name,
        encodes the uploaded raw data, and stores batched splits in
        `self.data`.

        Args:
            config: nested config dict; the discovered vocab size is written
                back into config["processor"]["params"]["vocab_size"].
        """
        super(ClsProvider, self).__init__(config)
        # Setup encoding pipeline
        cleaner = text.clean_text()
        # Separator comes from provider args (e.g. "" for chars, " " for words).
        tokenizer = text.Tokenizer(sep=self.args["separator"])
        # word_tokenizer = text.Tokenizer(sep=' ')

        self.input_col_name = self.args["input_col_name"]
        self.label_col_name = self.args["label_col_name"]

        # Text vocab grows during the encoding pass; frozen after it.
        enc = text.Encoder(update_vocab=True, no_special_chars=False)
        # Separate vocab for class labels; no special tokens.
        label_enc = text.Encoder(update_vocab=True, no_special_chars=True)

        self.encoder = {}
        self.encoder["inp"] = [U.chainer(funcs=[cleaner, tokenizer, enc])]
        self.encoder["label"] = [U.chainer(funcs=[label_enc])]
        self.encoder["pred"] = [U.chainer(funcs=[label_enc])]
        # Identity pipeline for fields passed through untouched.
        self.encoder["mask"] = [U.chainer(funcs=[lambda x: x])]
        self.encoder["out"] = self.encoder["mask"]
        self.encoder["meta"] = self.encoder["mask"]

        self.decoder = {}
        self.decoder["inp"] = [
            U.chainer(funcs=[enc.decode, tokenizer.detokenize])
        ]
        self.decoder["label"] = [U.chainer(funcs=[label_enc.decode])]
        self.decoder["pred"] = [U.chainer(funcs=[label_enc.decode])]
        self.decoder["mask"] = [U.chainer(funcs=[lambda x: x])]
        self.decoder["out"] = self.decoder["mask"]
        self.decoder["meta"] = self.decoder["mask"]

        # Process data
        raw_data = self.upload_data()
        # This encoding pass also populates the vocab (update_vocab=True).
        processed = [self.__call__(d, list_input=True) for d in raw_data]
        enc.freeze()
        # d = self._create_splits(s)
        self.data_raw = raw_data
        self.data = tuple([self._to_batches(split) for split in processed])

        self.sample_data = raw_data[0][1]
        self.logger.info(f"Vocab size: {len(enc.vocab)}")
        self.logger.debug(f"Vocab: {enc.vocab}")
        self.logger.debug(
            f"Sample data: \n Raw: {raw_data[0][1]} \n Encoded: {self(raw_data[0][1])}"
        )
        config["processor"]["params"]["vocab_size"] = len(
            enc.vocab)  # Needs changing, we might have multiple vocabs
 def __init__(self):
     """Build a class-label encode/decode pipeline around a single Encoder."""
     # Labels are plain tokens, so special characters are excluded; the
     # vocab grows as new labels are seen.
     enc = text.Encoder(no_special_chars=True, update_vocab=True)
     self.enc = enc
     self.decoder = U.chainer(funcs=[enc.decode])
     self.encoder = U.chainer(funcs=[enc])
    def __init__(self, config):
        """Set up a multi-task provider: one shared text pipeline plus one
        label pipeline per task.

        All text fields share a single text encoder/vocab; each "class" label
        gets its own ClassPipeline. Encodes the uploaded data, computes
        per-task class counts and class weights, and writes num_classes,
        class_weights, vocab_size and padding_indx back into
        config["processor"]["params"].

        Args:
            config: nested config dict describing loaders, the text encoder
                class, task ordering, and label encoders ("text" or "class").

        Raises:
            ValueError: if a label_encoder entry is neither "text" nor "class".
        """
        super(MultiTaskProvider, self).__init__(config)
        # All text fields share the text encoder
        # Each label has a new encoder

        # Setup encoding pipeline
        self.task_order = config["processor"]["params"]["task_order"]
        self.label_indx = config["processor"]["params"]["label_indx"]
        self.label_encoder = config["processor"]["params"]["label_encoder"]
        self.masks = config["processor"]["params"]["mask_weights"]
        self.down_weight_classes = config["processor"]["params"][
            "down_weight_classes"]

        # Loaders
        self.loaders = []
        for l in config["processor"]["params"]["loaders"]:
            # bad, this should be passed in a better way
            l["params"]["data_path"] = config["data_path"]
            self.loaders.append(
                U.load_class(l["module"],
                             l["class"],
                             l["params"],
                             pass_params_as_dict=True))

        # Text pipeline

        # The text pipeline class is configurable; instantiated dynamically.
        _tmodule = config["encoder"]["module"]
        _tclass = config["encoder"]["class"]
        _tparams = config["encoder"]["params"]
        text_pipe = U.load_class(_tmodule,
                                 _tclass,
                                 _tparams,
                                 pass_params_as_dict=True)

        # self.sep = config["processor"]["params"]["separator"]
        self.num_labels = len(self.label_indx)
        # self.max_vocab_size = config["processor"]["params"]["max_vocab_size"]
        # self.min_vocab_count = config["processor"]["params"]["min_vocab_count"]
        # text_pipe = pipeline.BertTextPipeline(
        #    sep=self.sep,
        #    max_vocab_size=self.max_vocab_size,
        #    min_vocab_count=self.min_vocab_count,
        # )
        # Labels pipeline

        # Identity pipeline. NOTE(review): this lambda takes (x, list_input),
        # unlike the plain `lambda x: x` used by the other providers —
        # presumably U.chainer forwards list_input here; confirm.
        as_is = U.chainer(funcs=[lambda x, list_input: x])

        self.encoder = {}
        self.decoder = {}

        self.encoder["inp"] = [text_pipe.encoder]
        self.decoder["inp"] = [text_pipe.decoder]

        self.encoder["label"] = []
        self.decoder["label"] = []
        self.decoder["pred"] = []

        self.pipelines = []

        # One pipeline per task label, in label_encoder order.
        for label in self.label_encoder:
            if label == "text":
                # All labels of type text will share the same text pipeline (tokenizer / vocab)
                self.logger.info("Adding text encoder")
                self.pipelines.append(text_pipe)
                self.encoder["label"].append(text_pipe.encoder)
                self.decoder["label"].append(text_pipe.decoder)
                self.decoder["pred"].append(text_pipe.decoder)
            elif label == "class":
                self.logger.info("Adding class encoder")
                cls_pipe = pipeline.ClassPipeline()
                self.pipelines.append(cls_pipe)
                self.encoder["label"].append(cls_pipe.encoder)
                self.decoder["label"].append(cls_pipe.decoder)
                self.decoder["pred"].append(cls_pipe.decoder)
            else:
                raise ValueError("Label_encoder can be either text or class")

        # self.encoder['pred'] = self.encoder['inp']
        self.encoder["pred"] = self.encoder["label"]
        self.encoder["mask"] = [as_is for _ in range(self.num_labels)]
        self.encoder["out"] = as_is
        self.encoder["meta"] = as_is

        # self.decoder['pred'] = [as_is, class_enc.decode]

        self.decoder["mask"] = [as_is for _ in range(self.num_labels)]
        self.decoder["out"] = [as_is for _ in range(self.num_labels)]
        self.decoder["meta"] = [as_is]

        # Process data
        raw_data = self.upload_data()

        # Build vocab through first round over data
        # text_pipe.enc.build_vocab(raw_data)

        self.logger.info(f"Splits: {len(raw_data)}")

        for i, split in enumerate(raw_data):
            self.logger.info(f"Split {i} size: {len(split['inp'][0])}")

        # print(raw_data[0])
        # Process data
        # First pass: encoding with update_vocab populates the vocab.
        s = [self.__call__(d, list_input=True) for d in raw_data]

        # text_pipe.enc.filter_vocab()

        # text_pipe.enc.freeze()

        # Now encode data
        # NOTE(review): second pass re-encodes with the vocab settled by the
        # first pass; result of the first pass is discarded. Confirm this
        # two-pass scheme is intended (freeze() above is commented out).
        s = [self.__call__(d, list_input=True) for d in raw_data]

        num_classes = []
        class_weights = []

        # Gather per-task class counts and (down-weighted) class weights.
        for i, l_encoder in enumerate(self.pipelines):
            # self.logger.info(f"Label encoder {l_encoder.enc.vocab}")
            num_classes.append(l_encoder.get_num_classes())
            self.logger.debug(f"Class Distribution for task {i}")
            self.logger.debug(l_encoder.get_vocab_counts(as_list=True))
            self.logger.debug(l_encoder.get_vocab_counts(as_list=False))
            self.logger.debug(
                l_encoder.get_vocab_weights(as_list=False,
                                            min_w=self.down_weight_classes))
            self.logger.debug(
                l_encoder.get_vocab_weights(as_list=True,
                                            min_w=self.down_weight_classes))
            class_weights.append(
                l_encoder.get_vocab_weights(as_list=True,
                                            min_w=self.down_weight_classes))

        config["processor"]["params"]["num_classes"] = num_classes

        # TODO: Move this to a better place
        # Weights are moved to the training device up front.
        class_weights = [
            torch.Tensor(cw).to(config["device"]) for cw in class_weights
        ]
        config["processor"]["params"]["class_weights"] = class_weights
        self.logger.debug(class_weights)
        self.logger.info(f"Number of classes of output: {num_classes}")

        # text_pipe.enc.freeze()
        # d = self._create_splits(s)
        self.data_raw = raw_data
        self.data = tuple([self._to_batches(split) for split in s])

        # self.sample_data_raw = raw_data[0][1]
        self.sample_data_raw = self.get_sample(0)
        print(self.sample_data_raw)

        # self.sample_data_processed = s[0][1]
        config["processor"]["params"]["vocab_size"] = len(
            text_pipe.enc.vocab
        )  # Needs changing, we might have multiple vocabs
        self.logger.info(f"Vocab size: {len(text_pipe.enc.vocab)}")
        self.logger.info("First 10 vocab words:")
        self.logger.info(list(text_pipe.enc.vocab.items())[:10])
        self.logger.info("Top frequent words:")
        # self.logger.info(text_pipe.enc.wc.most_common(20))
        # Index 0 is reserved for padding by convention here.
        config["processor"]["params"]["padding_indx"] = 0