def __init__(self, seperator, max_vocab_size, min_vocab_count):
    """Assemble encode/decode chains for a token-level text pipeline.

    NOTE(review): parameter name ``seperator`` is a typo for ``separator``;
    kept as-is because callers may pass it by keyword.
    """
    text_cleaner = text.clean_text()
    splitter = text.Tokenizer(sep=seperator)
    token_encoder = text.Encoder(
        update_vocab=True,
        no_special_chars=False,
        max_vocab_size=max_vocab_size,
        min_vocab_count=min_vocab_count,
    )
    self.enc = token_encoder
    # Forward chain: clean -> split on `seperator` -> token ids.
    self.encoder = U.chainer(funcs=[text_cleaner, splitter, token_encoder])
    # Reverse chain undoes the forward one: ids -> tokens -> string.
    self.decoder = U.chainer(funcs=[token_encoder.decode, splitter.detokenize])
    # Exposes the encoder vocab; presumably used downstream as the output
    # dimension — TODO confirm callers expect the vocab object, not its size.
    self.num_classes = token_encoder.vocab
def __init__(self, config):
    """Character-level LM pair provider.

    Builds per-field encode/decode chains, runs every raw split through
    them, freezes the character vocab, and publishes the vocab size back
    into ``config`` for the downstream processor.
    """
    super(MultiLMPairProvider, self).__init__(config)
    # Setup encoding pipeline
    clean = text.clean_text()
    char_tok = text.Tokenizer(sep="")
    char_enc = text.Encoder(update_vocab=True, no_special_chars=False)
    encode_sentence = U.chainer(funcs=[clean, char_tok, char_enc])
    decode_sentence = U.chainer(funcs=[char_enc.decode, char_tok.detokenize])
    label_enc = text.Encoder(update_vocab=True, no_special_chars=True)
    identity = U.chainer(funcs=[lambda x: x])

    inp_codecs = [encode_sentence, encode_sentence]
    self.encoder = {
        "inp": inp_codecs,
        "label": [encode_sentence, U.chainer(funcs=[label_enc])],
        "pred": inp_codecs,  # deliberately the same list object as "inp"
        "mask": [identity, identity],
        "out": identity,
        "meta": identity,
    }
    self.decoder = {
        "inp": [decode_sentence, decode_sentence],
        "label": [decode_sentence, U.chainer(funcs=[label_enc.decode])],
        "pred": [identity, label_enc.decode],
        "mask": [identity, identity],
        "out": [identity, identity],
        "meta": identity,
    }

    # Process data
    raw_data = self.upload_data()
    encoded_splits = [self.__call__(d, list_input=True) for d in raw_data]
    char_enc.freeze()  # vocab is fixed once all splits have been seen
    # d = self._create_splits(s)
    self.data_raw = raw_data
    self.logger.info("Sample Raw Example:")
    self.logger.info(self.data_raw[0])
    self.data = tuple(self._to_batches(split) for split in encoded_splits)
    self.sample_data = raw_data[0][1]
    config["processor"]["params"]["vocab_size"] = len(
        char_enc.vocab)  # Needs changing, we might have multiple vocabs
def test_chainer():
    """A tokenize -> encode -> decode -> detokenize chain must round-trip."""
    tok = text.Tokenizer(sep=" ")
    codec = text.Encoder(update_vocab=True)
    round_trip = utils.chainer(
        funcs=[tok, codec, codec.decode, tok.detokenize])
    # Single (non-list) input.
    single = "مرحبا هنا"
    assert round_trip(single, list_input=False) == single
    # Batched (list) input.
    batch = ["hi_there man_", "how are you?"]
    assert round_trip(batch, list_input=True) == batch
def __init__(self, config):
    """Single-task classification provider: a text column and a label column.

    Sets up the encode/decode chains, encodes all raw splits, freezes the
    token vocab, and writes the vocab size back into ``config``.
    """
    super(ClsProvider, self).__init__(config)
    # Setup encoding pipeline
    clean = text.clean_text()
    tok = text.Tokenizer(sep=self.args["separator"])
    # word_tokenizer = text.Tokenizer(sep=' ')
    self.input_col_name = self.args["input_col_name"]
    self.label_col_name = self.args["label_col_name"]
    token_enc = text.Encoder(update_vocab=True, no_special_chars=False)
    label_enc = text.Encoder(update_vocab=True, no_special_chars=True)

    enc_pass = [U.chainer(funcs=[lambda x: x])]
    self.encoder = {
        "inp": [U.chainer(funcs=[clean, tok, token_enc])],
        "label": [U.chainer(funcs=[label_enc])],
        "pred": [U.chainer(funcs=[label_enc])],
        "mask": enc_pass,
        "out": enc_pass,   # same list object as "mask", as in the original aliasing
        "meta": enc_pass,  # same list object as "mask"
    }
    dec_pass = [U.chainer(funcs=[lambda x: x])]
    self.decoder = {
        "inp": [U.chainer(funcs=[token_enc.decode, tok.detokenize])],
        "label": [U.chainer(funcs=[label_enc.decode])],
        "pred": [U.chainer(funcs=[label_enc.decode])],
        "mask": dec_pass,
        "out": dec_pass,
        "meta": dec_pass,
    }

    # Process data
    raw_data = self.upload_data()
    encoded = [self.__call__(d, list_input=True) for d in raw_data]
    token_enc.freeze()  # no new vocab entries after the first full pass
    # d = self._create_splits(s)
    self.data_raw = raw_data
    self.data = tuple(self._to_batches(split) for split in encoded)
    self.sample_data = raw_data[0][1]
    self.logger.info(f"Vocab size: {len(token_enc.vocab)}")
    self.logger.debug(f"Vocab: {token_enc.vocab}")
    self.logger.debug(
        f"Sample data: \n Raw: {raw_data[0][1]} \n Encoded: {self(raw_data[0][1])}"
    )
    config["processor"]["params"]["vocab_size"] = len(
        token_enc.vocab)  # Needs changing, we might have multiple vocabs
def __init__(self):
    """Wrap a label Encoder in chainer-compatible encode/decode callables."""
    self.enc = text.Encoder(update_vocab=True, no_special_chars=True)
    self.encoder = U.chainer(funcs=[self.enc])
    self.decoder = U.chainer(funcs=[self.enc.decode])
def __init__(self, config):
    """Multi-task data provider.

    All text fields share one text pipeline (tokenizer + vocab); each
    class-type label gets its own fresh ClassPipeline. After wiring the
    per-field encoders/decoders, the raw data is encoded, per-task class
    counts and class weights are computed, and several values
    (num_classes, class_weights, vocab_size, padding_indx) are written
    back into ``config`` for the downstream processor.
    """
    super(MultiTaskProvider, self).__init__(config)
    # All text fields share the text encoder
    # Each label has a new encoder
    # Setup encoding pipeline
    self.task_order = config["processor"]["params"]["task_order"]
    self.label_indx = config["processor"]["params"]["label_indx"]
    self.label_encoder = config["processor"]["params"]["label_encoder"]
    self.masks = config["processor"]["params"]["mask_weights"]
    self.down_weight_classes = config["processor"]["params"][
        "down_weight_classes"]
    # Loaders: one dynamically loaded data-loader class per config entry.
    self.loaders = []
    for l in config["processor"]["params"]["loaders"]:
        # bad, this should be passed in a better way
        l["params"]["data_path"] = config["data_path"]
        self.loaders.append(
            U.load_class(l["module"], l["class"], l["params"],
                         pass_params_as_dict=True))
    # Text pipeline: also loaded dynamically from the "encoder" config section.
    _tmodule = config["encoder"]["module"]
    _tclass = config["encoder"]["class"]
    _tparams = config["encoder"]["params"]
    text_pipe = U.load_class(_tmodule, _tclass, _tparams,
                             pass_params_as_dict=True)
    # self.sep = config["processor"]["params"]["separator"]
    self.num_labels = len(self.label_indx)
    # self.max_vocab_size = config["processor"]["params"]["max_vocab_size"]
    # self.min_vocab_count = config["processor"]["params"]["min_vocab_count"]
    # text_pipe = pipeline.BertTextPipeline(
    #     sep=self.sep,
    #     max_vocab_size=self.max_vocab_size,
    #     min_vocab_count=self.min_vocab_count,
    # )
    # Labels pipeline. NOTE: this identity chainer takes (x, list_input),
    # unlike the single-arg identity used by the other providers.
    as_is = U.chainer(funcs=[lambda x, list_input: x])
    self.encoder = {}
    self.decoder = {}
    self.encoder["inp"] = [text_pipe.encoder]
    self.decoder["inp"] = [text_pipe.decoder]
    self.encoder["label"] = []
    self.decoder["label"] = []
    self.decoder["pred"] = []
    self.pipelines = []
    # One pipeline per label, chosen by its declared type ("text" | "class").
    for label in self.label_encoder:
        if label == "text":
            # All labels of type text will share the same text pipeline (tokenizer / vocab)
            self.logger.info("Adding text encoder")
            self.pipelines.append(text_pipe)
            self.encoder["label"].append(text_pipe.encoder)
            self.decoder["label"].append(text_pipe.decoder)
            self.decoder["pred"].append(text_pipe.decoder)
        elif label == "class":
            self.logger.info("Adding class encoder")
            cls_pipe = pipeline.ClassPipeline()
            self.pipelines.append(cls_pipe)
            self.encoder["label"].append(cls_pipe.encoder)
            self.decoder["label"].append(cls_pipe.decoder)
            self.decoder["pred"].append(cls_pipe.decoder)
        else:
            raise ValueError("Label_encoder can be either text or class")
    # self.encoder['pred'] = self.encoder['inp']
    # "pred" aliases the same list object as "label" (not a copy).
    self.encoder["pred"] = self.encoder["label"]
    self.encoder["mask"] = [as_is for _ in range(self.num_labels)]
    self.encoder["out"] = as_is
    self.encoder["meta"] = as_is
    # self.decoder['pred'] = [as_is, class_enc.decode]
    self.decoder["mask"] = [as_is for _ in range(self.num_labels)]
    self.decoder["out"] = [as_is for _ in range(self.num_labels)]
    self.decoder["meta"] = [as_is]
    # Process data
    raw_data = self.upload_data()
    # Build vocab through first round over data
    # text_pipe.enc.build_vocab(raw_data)
    self.logger.info(f"Splits: {len(raw_data)}")
    for i, split in enumerate(raw_data):
        self.logger.info(f"Split {i} size: {len(split['inp'][0])}")
    # print(raw_data[0])
    # Process data
    s = [self.__call__(d, list_input=True) for d in raw_data]
    # text_pipe.enc.filter_vocab()
    # text_pipe.enc.freeze()
    # Now encode data
    # NOTE(review): the data is encoded twice; presumably the first pass
    # only populates the vocab and this second pass produces the final
    # ids — verify, since the intervening filter/freeze calls are
    # commented out, which may make the first pass redundant.
    s = [self.__call__(d, list_input=True) for d in raw_data]
    num_classes = []
    class_weights = []
    # Collect per-task class counts and (down-weighted) class weights.
    for i, l_encoder in enumerate(self.pipelines):
        # self.logger.info(f"Label encoder {l_encoder.enc.vocab}")
        num_classes.append(l_encoder.get_num_classes())
        self.logger.debug(f"Class Distribution for task {i}")
        self.logger.debug(l_encoder.get_vocab_counts(as_list=True))
        self.logger.debug(l_encoder.get_vocab_counts(as_list=False))
        self.logger.debug(
            l_encoder.get_vocab_weights(as_list=False,
                                        min_w=self.down_weight_classes))
        self.logger.debug(
            l_encoder.get_vocab_weights(as_list=True,
                                        min_w=self.down_weight_classes))
        class_weights.append(
            l_encoder.get_vocab_weights(as_list=True,
                                        min_w=self.down_weight_classes))
    config["processor"]["params"]["num_classes"] = num_classes  # TODO: Move this to a better place
    # Move weights onto the configured device so the loss can use them directly.
    class_weights = [
        torch.Tensor(cw).to(config["device"]) for cw in class_weights
    ]
    config["processor"]["params"]["class_weights"] = class_weights
    self.logger.debug(class_weights)
    self.logger.info(f"Number of classes of output: {num_classes}")
    # text_pipe.enc.freeze()
    # d = self._create_splits(s)
    self.data_raw = raw_data
    self.data = tuple([self._to_batches(split) for split in s])
    # self.sample_data_raw = raw_data[0][1]
    self.sample_data_raw = self.get_sample(0)
    print(self.sample_data_raw)
    # self.sample_data_processed = s[0][1]
    config["processor"]["params"]["vocab_size"] = len(
        text_pipe.enc.vocab
    )  # Needs changing, we might have multiple vocabs
    self.logger.info(f"Vocab size: {len(text_pipe.enc.vocab)}")
    self.logger.info("First 10 vocab words:")
    self.logger.info(list(text_pipe.enc.vocab.items())[:10])
    self.logger.info("Top frequent words:")
    # self.logger.info(text_pipe.enc.wc.most_common(20))
    # Index 0 is reserved for padding by convention of the Encoder —
    # TODO confirm against the Encoder implementation.
    config["processor"]["params"]["padding_indx"] = 0