def __init__(self, tokenizer, n_classes, min_bucket=5, max_bucket=45,
             bucket_steps=5, preprocessing=True, multi_label=False):
    """Set up a fine-tuning batch generator for single-text inputs.

    Args:
        tokenizer: Tokenizer object; stored on the instance.
        n_classes: Number of target classes; stored on the instance.
        min_bucket: First argument forwarded to
            ``ut.single_finetuning_bucketing``.
        max_bucket: Second argument forwarded to
            ``ut.single_finetuning_bucketing``.
        bucket_steps: Third argument forwarded to
            ``ut.single_finetuning_bucketing``.
        preprocessing: When truthy, ``ut.preprocessing()`` is created and
            stored in ``self.preprocessing``; otherwise ``None`` is stored.
        multi_label: Flag stored on the instance.
    """
    # Bucketing configuration and the bucketing object built from it.
    self.min_bucket = min_bucket
    self.max_bucket = max_bucket
    self.bucket_steps = bucket_steps
    self.buckets = ut.single_finetuning_bucketing(
        min_bucket, max_bucket, bucket_steps)

    # Task configuration.
    self.tokenizer = tokenizer
    self.n_classes = n_classes
    self.multi_label = multi_label

    # Optional text preprocessor.
    if preprocessing:
        self.preprocessing = ut.preprocessing()
    else:
        self.preprocessing = None
def generator(self, ids, x1, x2, y):
    """Yield fine-tuning batches for sentence-pair inputs, indefinitely.

    Every sample is (optionally) preprocessed, tokenized and handed to
    ``self.buckets``; the value returned by the last call is used as the
    mapping ``(bucket_1, bucket_2) -> list of (id, x1, x2, y)`` samples.
    The generator then loops forever over the buckets, yielding one batch
    per bucket containing all samples of that bucket in random order.

    Args:
        ids: Indexable collection of sample identifiers.
        x1: Indexable collection of first texts; its length defines the
            number of samples.
        x2: Indexable collection of second texts.
        y: Indexable collection of integer class labels.

    Yields:
        ``([batch_x, position_indices, segment_indices], batch_y)`` where
        the three inputs are ``int32`` arrays with
        ``bucket_1 + bucket_2 + 3`` columns and ``batch_y`` is the output
        of ``to_categorical`` with ``self.n_classes`` classes.

    Raises:
        ValueError: If no sample was bucketed (e.g. empty input). Without
            this check the endless loop below would spin forever without
            ever yielding.
    """
    bucked_samples = {}
    n_samples = len(x1)
    for i in range(n_samples):
        x1_i, x2_i = x1[i], x2[i]
        if self.preprocessing:
            x1_i = self.preprocessing(x1_i)
            x2_i = self.preprocessing(x2_i)
        x1_tok = ut.tokenize(x1_i, self.tokenizer)
        x2_tok = ut.tokenize(x2_i, self.tokenizer)
        # The bucketing object accumulates samples and returns the
        # current mapping on every call.
        bucked_samples = self.buckets(ids[i], x1_tok, x2_tok, y[i])

    if not bucked_samples:
        raise ValueError(
            "generator() received no samples: there is nothing to yield")

    while True:
        for bucket in bucked_samples:
            samples = bucked_samples[bucket]
            bucket_size = len(samples)
            bucket_1, bucket_2 = bucket[0], bucket[1]
            seq_len = bucket_1 + bucket_2 + 3

            # Every row shares the same positions and segment layout:
            # bucket_1 + 2 zeros followed by bucket_2 + 1 ones.
            position_indices = np.array(
                [list(range(seq_len))] * bucket_size, dtype="int32")
            segment_row = [0] * (bucket_1 + 2) + [1] * (bucket_2 + 1)
            segment_indices = np.array(
                [segment_row] * bucket_size, dtype="int32")

            batch_x = np.zeros((bucket_size, seq_len), dtype="int32")
            batch_y = np.zeros((bucket_size, ), dtype="int32")
            for row, (_, x1_s, x2_s, y_s) in enumerate(samples):
                tokens = ut.prepare_input(x1_s, x2_s)
                batch_x[row] = convert_tokens_to_ids(
                    self.tokenizer.vocab, tokens)
                batch_y[row] = y_s

            # Shuffle inputs and labels with the same permutation.
            p = np.random.permutation(bucket_size)
            batch_x = batch_x[p]
            batch_y = batch_y[p]
            batch_y = to_categorical(batch_y, num_classes=self.n_classes)
            yield ([batch_x, position_indices, segment_indices], batch_y)
def __init__(self, tokenizer, n_classes, min_bucket_a=5, min_bucket_b=5,
             max_bucket_a=45, max_bucket_b=45, bucket_steps=5,
             preprocessing=True):
    """Set up a fine-tuning batch generator for sentence-pair inputs.

    Args:
        tokenizer: Tokenizer object; stored on the instance.
        n_classes: Number of target classes; stored on the instance.
        min_bucket_a: First argument forwarded to
            ``ut.multiple_finetuning_bucketing``.
        min_bucket_b: Second argument forwarded to
            ``ut.multiple_finetuning_bucketing``.
        max_bucket_a: Third argument forwarded to
            ``ut.multiple_finetuning_bucketing``.
        max_bucket_b: Fourth argument forwarded to
            ``ut.multiple_finetuning_bucketing``.
        bucket_steps: Fifth argument forwarded to
            ``ut.multiple_finetuning_bucketing``.
        preprocessing: When truthy, ``ut.preprocessing()`` is created and
            stored in ``self.preprocessing``; otherwise ``None`` is stored.
    """
    # Bucketing configuration and the bucketing object built from it.
    self.min_bucket_a = min_bucket_a
    self.min_bucket_b = min_bucket_b
    self.max_bucket_a = max_bucket_a
    self.max_bucket_b = max_bucket_b
    self.bucket_steps = bucket_steps
    self.buckets = ut.multiple_finetuning_bucketing(
        min_bucket_a, min_bucket_b, max_bucket_a, max_bucket_b,
        bucket_steps)

    # Task configuration.
    self.tokenizer = tokenizer
    self.n_classes = n_classes

    # Optional text preprocessor.
    if preprocessing:
        self.preprocessing = ut.preprocessing()
    else:
        self.preprocessing = None
def generator(self, ids, x, y):
    """Yield fine-tuning batches for single-text inputs, indefinitely.

    Every sample is (optionally) preprocessed, tokenized and handed to
    ``self.buckets``; the value returned by the last call is used as the
    mapping ``bucket -> list of (id, x, y)`` samples. The generator then
    loops forever over the buckets, yielding one batch per bucket
    containing all samples of that bucket in random order.

    Args:
        ids: Indexable collection of sample identifiers.
        x: Indexable collection of texts; its length defines the number
            of samples.
        y: Indexable collection of labels. With ``self.multi_label`` each
            label must fit a row of ``self.n_classes`` integers; otherwise
            it is an integer class index.

    Yields:
        ``([batch_x, position_indices, segment_indices], batch_y)`` where
        the three inputs are ``int32`` arrays with ``bucket + 2`` columns.
        ``batch_y`` is the raw ``(bucket_size, n_classes)`` label matrix
        in multi-label mode, or the output of ``to_categorical``
        otherwise.

    Raises:
        ValueError: If no sample was bucketed (e.g. empty input). Without
            this check the endless loop below would spin forever without
            ever yielding.
    """
    bucked_samples = {}
    lx = len(x)
    for i in range(lx):
        x_i = x[i]
        if self.preprocessing:
            x_i = self.preprocessing(x_i)
        x_tok = ut.tokenize(x_i, self.tokenizer)
        # The bucketing object accumulates samples and returns the
        # current mapping on every call.
        bucked_samples = self.buckets(ids[i], x_tok, y[i])

    if not bucked_samples:
        raise ValueError(
            "generator() received no samples: there is nothing to yield")

    while True:
        for bucket in bucked_samples:
            samples = bucked_samples[bucket]
            bucket_size = len(samples)
            seq_len = bucket + 2

            # Every row shares the same positions; a single segment (0)
            # covers the whole sequence.
            position_indices = np.array(
                [list(range(seq_len))] * bucket_size, dtype="int32")
            segment_indices = np.array(
                [[0] * seq_len] * bucket_size, dtype="int32")

            batch_x = np.zeros((bucket_size, seq_len), dtype="int32")
            if self.multi_label:
                batch_y = np.zeros((bucket_size, self.n_classes),
                                   dtype="int32")
            else:
                batch_y = np.zeros((bucket_size, ), dtype="int32")

            for row, (_, x_s, y_s) in enumerate(samples):
                tokens = ut.prepare_single_input(x_s)
                batch_x[row] = convert_tokens_to_ids(
                    self.tokenizer.vocab, tokens)
                batch_y[row] = y_s

            # Shuffle inputs and labels with the same permutation.
            p = np.random.permutation(bucket_size)
            batch_x = batch_x[p]
            batch_y = batch_y[p]
            if not self.multi_label:
                batch_y = to_categorical(batch_y,
                                         num_classes=self.n_classes)
            yield ([batch_x, position_indices, segment_indices], batch_y)
def generator(self):
    """Yield pre-training batches read from ``self.dataset_file``, forever.

    The file is re-read from the start on every pass. Its first line is
    skipped, and every following line must contain exactly four
    tab-separated fields of which the second (text) and fourth (reply)
    are used. Each pair is tokenized and pushed to ``self.buckets`` twice:
    once as ``(text, reply)`` with ``y=1`` and once as ``(reply, text)``
    with ``y=0``. Whenever the batching step returns a result that is not
    ``None``, that result is yielded.

    The file is streamed line by line inside a ``with`` block, so the
    handle is released even if the consumer closes the generator or an
    exception is raised mid-pass, and the corpus is never loaded into
    memory as a whole.

    Yields:
        Whatever ``self.__batching`` returns for a batch that is not
        ``None``.

    Raises:
        ValueError: If a line does not split into exactly four
            tab-separated fields.
    """
    while True:
        with open(self.dataset_file, "r", encoding="utf8") as fr:
            fr.readline()  # Skip the first line of the file.
            for line in fr:
                _, text, _, reply = line.strip().split("\t")
                text = ut.tokenize(text.strip(), self.tokenizer)
                reply = ut.tokenize(reply.strip(), self.tokenizer)

                # Pair in original order, labelled 1.
                res = self.__batching(self.buckets(text, reply, y=1))
                if res is not None:
                    yield res

                # Same pair with the order swapped, labelled 0.
                res = self.__batching(self.buckets(reply, text, y=0))
                if res is not None:
                    yield res
def __init__(self, dataset_file, tokenizer, batch_size, mlm_type,
             mlm_max_span, mask_prob, probs_mlm, min_bucket_a, min_bucket_b,
             max_bucket_a, max_bucket_b, bucket_steps, use_rop):
    """Set up the pre-training batch generator.

    Args:
        dataset_file: Path of the dataset file; stored on the instance.
        tokenizer: Tokenizer object exposing a ``vocab`` mapping.
        batch_size: Requested batch size; an odd value is incremented by
            one so the stored batch size is even.
        mlm_type: Masking mode; stored on the instance.
        mlm_max_span: Maximum span setting; stored on the instance.
        mask_prob: Masking probability; stored on the instance.
        probs_mlm: Masking probabilities setting; stored on the instance.
        min_bucket_a: First argument forwarded to ``ut.bucketing``.
        min_bucket_b: Second argument forwarded to ``ut.bucketing``.
        max_bucket_a: Third argument forwarded to ``ut.bucketing``.
        max_bucket_b: Fourth argument forwarded to ``ut.bucketing``.
        bucket_steps: Fifth argument forwarded to ``ut.bucketing``.
        use_rop: Flag stored on the instance.
    """
    self.dataset_file = dataset_file

    # Odd batch sizes are bumped to the next even number.
    if batch_size % 2 != 0:
        batch_size = batch_size + 1
    self.batch_size = batch_size

    # Bucketing configuration and the bucketing object built from it.
    self.min_bucket_a = min_bucket_a
    self.min_bucket_b = min_bucket_b
    self.max_bucket_a = max_bucket_a
    self.max_bucket_b = max_bucket_b
    self.bucket_steps = bucket_steps
    self.buckets = ut.bucketing(min_bucket_a, min_bucket_b, max_bucket_a,
                                max_bucket_b, bucket_steps, batch_size)

    # Masked-language-model settings.
    self.mask_prob = mask_prob
    self.probs_mlm = probs_mlm
    self.mlm_type = mlm_type
    self.mlm_max_span = mlm_max_span

    # Vocabulary words, dropping the first five entries
    # (presumably the special tokens -- TODO confirm).
    self.tokenizer = tokenizer
    all_words = list(tokenizer.vocab.keys())
    self.vocab_words = all_words[5:]
    self.vocab_size = len(self.vocab_words)

    self.use_rop = use_rop
pkm = config["model"]["pkm"] pkm_params = config["model"]["pkm_params"] use_rop = config["model"]["rop"]["use_rop"] rop_n_hidden = config["model"]["rop"]["n_hidden"] rop_hidden_size = config["model"]["rop"]["hidden_size"] output_encoder_size = [hidden_size for i in range(n_encoders)] attention_size = [attention_size for i in range(n_encoders)] n_heads = [n_heads for i in range(n_encoders)] ################################## # Load Data # ids_tr, x_tr, y_tr = ut.load_dataset(train_file, id_header, text_header, class_header, categories, multi_label, delimiter) ids_dv, x_dv, y_dv = ut.load_dataset(dev_file, id_header, text_header, class_header, categories, multi_label, delimiter) ids_ts, x_ts, y_ts = ut.load_dataset(test_file, id_header, text_header, class_header, categories, multi_label, delimiter) if multi_label: n_classes = len(y_tr[0]) gen_tr = SingleFinetuningGenerator(tokenizer, n_classes, bucket_min, bucket_max, bucket_steps, preprocessing, multi_label) gen_dv = SingleFinetuningGenerator(tokenizer, n_classes, bucket_min,
None, None, pkm, pkm_params, input_length=None, use_rop=use_rop) twilbert_model.build() model = twilbert_model.model twilbert_model.compile(model) twilbert_model.load(model, path_load_weights) print(model.summary()) dataset = ut.load_lm_dataset(dataset_file) preprocess = ut.preprocessing() dataset = [ut.tokenize(preprocess(text), tokenizer) for text in dataset] gamma = 0. N = len(dataset) for i in range(N): if i % 50 == 0: print("T=%d P(X)=%.3f" % (i + 1, (gamma / (i + 1)))) X = ut.prepare_single_input(dataset[i]) # Añadir [CLS] y [SEP] T = len(X) # Cada muestra X tiene tantos posibles enmascaramientos como |X| # maskings = [ut.mask_lm_eval(X, t) for t in range(T)][1:-1] c = 1 alpha = 0 for masking in maskings: x = convert_tokens_to_ids(tokenizer.vocab, masking)
def __batching(self, batch):
    """Turn a bucketed batch into model inputs and pre-training targets.

    Args:
        batch: ``None`` or an indexable ``(x1, x2, y)`` triple. ``x1`` and
            ``x2`` hold the token sequences of the batch (the lengths of
            their first elements define the bucket sizes) and ``y`` holds
            the per-sample integer labels.

    Returns:
        ``None`` when ``batch`` is ``None``. Otherwise
        ``([batch_x, position_indices, segment_indices], targets)`` where
        ``targets`` is ``[batch_rop, batch_mlm]`` if ``self.use_rop`` is
        truthy and ``[batch_mlm]`` otherwise. ``batch_mlm`` has a trailing
        axis of size one. Rows of samples whose masking failed are left
        filled with zeros.

    Raises:
        ValueError: If ``self.mlm_type`` is neither ``"token"`` nor
            ``"span"``.
    """
    if batch is None:
        return None

    x1, x2, y = batch[0], batch[1], batch[2]
    bucket_1, bucket_2 = len(x1[0]), len(x2[0])
    seq_len = bucket_1 + bucket_2 + 3

    batch_x = np.zeros((self.batch_size, seq_len), dtype="int32")
    batch_rop = np.zeros(self.batch_size, dtype="int32")
    batch_mlm = np.zeros((self.batch_size, seq_len), dtype="int32")

    # Every row shares the same positions and segment layout:
    # bucket_1 + 2 zeros followed by bucket_2 + 1 ones.
    position_indices = np.array(
        [list(range(seq_len))] * self.batch_size, dtype="int32")
    segment_row = [0] * (bucket_1 + 2) + [1] * (bucket_2 + 1)
    segment_indices = np.array(
        [segment_row] * self.batch_size, dtype="int32")

    for i in range(self.batch_size):
        x = ut.prepare_input(x1[i], x2[i])
        x_ids = convert_tokens_to_ids(self.tokenizer.vocab, x)

        # Masking is best-effort: a failing sample is reported and
        # skipped. Only ``Exception`` is caught so that KeyboardInterrupt
        # and SystemExit still propagate.
        if self.mlm_type == "token":
            try:
                masked_x, mask = ut.mask_tokens(
                    x, self.mask_prob, self.probs_mlm, self.vocab_words)
            except Exception:
                print("Error sample")
                continue
        elif self.mlm_type == "span":
            try:
                masked_x, mask = ut.mask_spans(
                    x, self.mask_prob, self.probs_mlm, self.vocab_words,
                    self.mlm_max_span)
            except Exception:
                print("Error sample")
                continue
        else:
            raise ValueError(
                "Unsupported mlm_type: %r (expected 'token' or 'span')"
                % (self.mlm_type,))

        mask = np.array(mask, dtype="int")
        masked_x_ids = convert_tokens_to_ids(self.tokenizer.vocab,
                                             masked_x)
        batch_x[i] = masked_x_ids
        batch_mlm[i] = ut.prepare_mlm_output(x_ids, mask)
        batch_rop[i] = y[i]

    # Shuffle inputs and targets with the same permutation.
    p = np.random.permutation(self.batch_size)
    batch_x = batch_x[p]
    batch_rop = batch_rop[p]
    batch_mlm = np.expand_dims(batch_mlm[p], -1)

    inputs = [batch_x, position_indices, segment_indices]
    if self.use_rop:
        return (inputs, [batch_rop, batch_mlm])
    return (inputs, [batch_mlm])
model = twilbert_model.pretrained_model print(model.summary()) fr = open(dataset_file, "r", encoding="utf8") fr.readline() tweets = [] replies = [] labels = [] for line in fr.readlines(): sline = line.strip().split("\t") tweets.append(sline[0].strip()) replies.append(sline[1].strip()) labels.append(int(sline[2].strip())) preprocess = ut.preprocessing() tweets = [ut.tokenize(preprocess(text), tokenizer) for text in tweets] replies = [ut.tokenize(preprocess(text), tokenizer) for text in replies] N = len(tweets) embeddings = [] for i in range(N): X = ut.prepare_input(tweets[i], replies[i]) indices = convert_tokens_to_ids(tokenizer.vocab, X) position_indices = list(range(len(X))) segment_indices = [0 for _ in range(len(tweets[i]) + 2)] + \ [1 for _ in range(0, len(replies[i]) + 1, 1)] pred = model.predict([ np.array([indices]), np.array([position_indices]), np.array([segment_indices]) ])[0]
pkm, pkm_params, input_length=None) twilbert_model.build() model = twilbert_model.model pretrained_model = twilbert_model.pretrained_model twilbert_model.compile(model) model.load_weights(pretrained_model_weights) ######################### # Load Data # ids_tr, x1_tr, x2_tr, y_tr = ut.load_multiple_dataset( train_file, id_header, text_header, aux_header, class_header, categories, delimiter) ids_dv, x1_dv, x2_dv, y_dv = ut.load_multiple_dataset( dev_file, id_header, text_header, aux_header, class_header, categories, delimiter) ids_ts, x1_ts, x2_ts, y_ts = ut.load_multiple_dataset( test_file, id_header, text_header, aux_header, class_header, categories, delimiter) gen_tr = MultipleFinetuningGenerator(tokenizer, n_classes, bucket_min_a, bucket_min_b, bucket_max_a, bucket_max_b, bucket_steps, preprocessing)
pkm, pkm_params, input_length=None) twilbert_model.build() model = twilbert_model.model pretrained_model = twilbert_model.pretrained_model twilbert_model.compile(model) model.load_weights(pretrained_model_weights) ######################### # Load Data # ids_tr, x_tr, y_tr = ut.load_dataset(train_file, id_header, text_header, class_header, categories, multi_label, delimiter) ids_dv, x_dv, y_dv = ut.load_dataset(dev_file, id_header, text_header, class_header, categories, multi_label, delimiter) ids_ts, x_ts, y_ts = ut.load_dataset(test_file, id_header, text_header, class_header, categories, multi_label, delimiter) if multi_label: n_classes = len(y_tr[0]) gen_tr = SingleFinetuningGenerator(tokenizer, n_classes, bucket_min, bucket_max, bucket_steps, preprocessing, multi_label) gen_dv = SingleFinetuningGenerator(tokenizer, n_classes, bucket_min,