def main(_):
    trn_snt_files = [
        # '../datasets/training/as_simplified_training.utf8',
        '../datasets/training/cityu_simplified_training.utf8',
        '../datasets/training/msr_training.utf8',
        '../datasets/training/pku_training.utf8'
    ]
    trn_lbl_files = [splitext(f)[0] + '.bies' for f in trn_snt_files]

    tf.logging.info('Loading training data...')
    trn_snts = read_sentences(trn_snt_files)
    y_trn = read_labels(trn_lbl_files)
    assert len(trn_snts) == len(y_trn), 'Number of sentences and labels must be equal'

    # Fit a new tokenizer or reload a previously saved one.
    train_tok = False
    tokenizer = Tokenizer(trn_snts, verbose=True)
    if train_tok:
        tokenizer.fit()
        tokenizer.save()
    else:
        tokenizer.load()
    x_uni_trn, x_bi_trn = process_sentences(trn_snts, tokenizer)

    tf.logging.info('Creating model...')
    model = create_model(tokenizer.vocab_size(), stacked=False)
    model.summary()

    tf.logging.info('Training model...')
    epochs = 10
    batch_size = 32
    steps = len(x_uni_trn) // batch_size
    for epoch in range(epochs):
        print('Epoch', epoch + 1)
        for uni_b, bi_b, lbl_b in tqdm(
                train_data_generator([x_uni_trn, x_bi_trn, y_trn],
                                     batch_size, shuffle=True),
                desc='Training Loop', total=steps):
            try:
                loss, acc = model.train_on_batch([uni_b, bi_b], lbl_b)
                # print('Loss:', loss, 'Acc:', acc)
            except Exception as e:
                # Skip malformed batches instead of aborting the whole run.
                print(e)
    model.save('unstacked_combined_model.h5')
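# A minimal inference sketch, assuming the hypothetical test-file path below and
# that the saved model contains only standard Keras layers (otherwise
# tf.keras.models.load_model would need a custom_objects mapping). It reuses the
# fitted Tokenizer and process_sentences() to build the unigram/bigram inputs
# expected by the model saved in main().
def predict_example():
    tst_snts = read_sentences(['../datasets/testing/pku_test.utf8'])  # hypothetical path
    tokenizer = Tokenizer(tst_snts, verbose=True)
    tokenizer.load()  # reuse the vocabulary fitted during training
    x_uni_tst, x_bi_tst = process_sentences(tst_snts, tokenizer)
    model = tf.keras.models.load_model('unstacked_combined_model.h5')
    # One BIES probability distribution per character position.
    return model.predict([x_uni_tst, x_bi_tst])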
def get_training_data(cfg):
    if cfg.input_data_dir():
        filenames = glob.glob(cfg.input_data_dir() + os.sep + "*.csv")
        for f in filenames:
            cfg.logger.info("reading and combining files: " + f)
        df = pd.concat([pd.read_csv(f) for f in filenames])
    else:
        cfg.logger.info("No training data dir provided")
        sys.exit()

    cfg.logger.info("input data frame: " + str(df.shape))
    df = df.astype(str)
    df_train, df_test = train_test_split(df, test_size=cfg.test_train_split(),
                                         random_state=cfg.random_seed())

    # Fit the tokenizers on the full data set so both splits share one vocabulary.
    x = df[cfg.input_col()].tolist()
    y = df[cfg.output_col()].tolist()
    input_pp = Tokenizer(cfg.num_input_tokens())
    output_pp = Tokenizer(cfg.num_output_tokens())
    input_pp.fit(x)
    output_pp.fit(y)

    # Vectorise only the training split; targets are post-padded and have
    # indicator tokens appended. (df_test is not used here.)
    x_train = df_train[cfg.input_col()].tolist()
    y_train = df_train[cfg.output_col()].tolist()
    input_vecs = input_pp.transform(x_train, cfg.input_seq_len(),
                                    padding=True, post=False,
                                    append_indicators=False)
    output_vecs = output_pp.transform(y_train, cfg.output_seq_len(),
                                      padding=True, post=True,
                                      append_indicators=True)
    return input_vecs, output_vecs
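# A minimal call sketch, assuming a project config object exposing the accessors
# used above (input_data_dir, input_col, output_col, ...); the Config name and
# its construction are hypothetical.
# cfg = Config("config.yaml")
# input_vecs, output_vecs = get_training_data(cfg)
# cfg.logger.info("vectorised %d training pairs" % len(input_vecs))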
class AttentionTFIDFClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, hiddens=300, mindf=2, lan='english', stopwords='nltk',
                 k=512, max_drop=.85, batch_size=64, lr=5e-3, weight_decay=5e-3,
                 nepochs=1000, patience=10, factor=.95, vocab_max_size=300000,
                 n_jobs=cpu_count(), _device=torch.device('cuda:0'),
                 _verbose=False):
        super(AttentionTFIDFClassifier, self).__init__()
        self._model = None
        self._tokenizer = None
        self.nepochs = int(nepochs)
        self.hiddens = int(hiddens)
        self.mindf = int(mindf)
        self.lan = lan
        self.stopwords = stopwords
        self.k = int(k)
        self.max_drop = max_drop
        self.vocab_max_size = vocab_max_size
        self._verbose = _verbose
        self._device = _device
        self.n_jobs = int(n_jobs)
        self.lr = lr
        self.weight_decay = weight_decay
        self.patience = int(patience)
        self.factor = factor
        self.batch_size = int(batch_size)

        def collate_train(param):
            # Pad variable-length documents and bucket TF/DF counts into log2 bins.
            X, y = zip(*param)
            y = self._tokenizer.le.transform(y)
            doc_tids, TFs, DFs = self._tokenizer.transform(X, verbose=False)
            doc_tids = pad_sequence(list(map(torch.LongTensor, doc_tids)),
                                    batch_first=True, padding_value=0)
            TFs = pad_sequence(list(map(torch.tensor, TFs)),
                               batch_first=True, padding_value=0)
            TFs = torch.log2(TFs + 1).round().long()
            DFs = pad_sequence(list(map(torch.tensor, DFs)),
                               batch_first=True, padding_value=0)
            DFs = torch.log2(DFs + 1).round().long()
            return doc_tids, TFs, DFs, torch.LongTensor(y)

        def collate_predict(X):
            # Same preprocessing as collate_train, but without labels.
            doc_tids, TFs, DFs = self._tokenizer.transform(X, verbose=False)
            doc_tids = pad_sequence(list(map(torch.LongTensor, doc_tids)),
                                    batch_first=True, padding_value=0)
            TFs = pad_sequence(list(map(torch.tensor, TFs)),
                               batch_first=True, padding_value=0)
            TFs = torch.log2(TFs + 1).round().long()
            DFs = pad_sequence(list(map(torch.tensor, DFs)),
                               batch_first=True, padding_value=0)
            DFs = torch.log2(DFs + 1).round().long()
            return doc_tids, TFs, DFs

        self.collate_train = collate_train
        self.collate_predict = collate_predict

    def fit(self, X_train, y_train, X_val=None, y_val=None):
        if X_val is None or y_val is None:
            raise ValueError('A validation set (X_val, y_val) is required for early stopping.')
        self._tokenizer = Tokenizer(mindf=self.mindf, lan=self.lan,
                                    stopwordsSet=self.stopwords, model='sample',
                                    k=self.k, verbose=self._verbose)
        self._tokenizer.fit(X_train, y_train)
        self.maxF = int(round(np.log2(self._tokenizer.maxF + 1)))
        self._model = AttentionTFIDF(vocab_size=self._tokenizer.vocab_size,
                                     hiddens=self.hiddens,
                                     nclass=self._tokenizer.n_class,
                                     maxF=self.maxF,
                                     drop=self.max_drop).to(self._device)
        optimizer = optim.AdamW(self._model.parameters(), lr=self.lr,
                                weight_decay=self.weight_decay)
        loss_func_cel = nn.CrossEntropyLoss().to(self._device)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=self.factor, patience=3,
            verbose=self._verbose)
        best = 99999.
        best_acc = 0.
        counter = 1
        dl_val = DataLoader(list(zip(X_val, y_val)), batch_size=self.batch_size,
                            shuffle=False, collate_fn=self.collate_train,
                            num_workers=self.n_jobs)
        for e in tqdm(range(self.nepochs), total=self.nepochs,
                      disable=not self._verbose):
            dl_train = DataLoader(list(zip(X_train, y_train)),
                                  batch_size=self.batch_size, shuffle=True,
                                  collate_fn=self.collate_train,
                                  num_workers=self.n_jobs)
            loss_train = 0.
            with tqdm(total=len(y_train) + len(y_val), smoothing=0.,
                      desc=f"ACC_val: {best_acc:.2} Epoch {e+1}",
                      disable=not self._verbose) as pbar:
                total = 0
                correct = 0
                self._model.train()
                self._tokenizer.model = 'sample'
                for i, (doc_tids, TFs, DFs, y) in enumerate(dl_train):
                    doc_tids = doc_tids.to(self._device)
                    TFs = TFs.to(self._device)
                    DFs = DFs.to(self._device)
                    y = y.to(self._device)
                    pred_docs, _, _ = self._model(doc_tids, TFs, DFs)
                    pred_docs = torch.softmax(pred_docs, dim=1)
                    loss = loss_func_cel(pred_docs, y)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    loss_train += loss.item()
                    total += len(y)
                    y_pred = pred_docs.argmax(axis=1)
                    correct += (y_pred == y).sum().item()
                    # Dropout is scaled with the running training accuracy.
                    self._model.drop_ = (correct / total) * self.max_drop
                    pbar.update(len(y))
                    del doc_tids, TFs
                    del DFs, y, pred_docs
                    del loss, y_pred
                loss_train = loss_train / (i + 1)

                total = 0
                correct = 0
                self._model.eval()
                self._tokenizer.model = 'topk'
                with torch.no_grad():
                    loss_val = 0.
                    for i, (doc_tids, TFs, DFs, y) in enumerate(dl_val):
                        doc_tids = doc_tids.to(self._device)
                        TFs = TFs.to(self._device)
                        DFs = DFs.to(self._device)
                        y = y.to(self._device)
                        pred_docs, _, _ = self._model(doc_tids, TFs, DFs)
                        pred_docs = torch.softmax(pred_docs, dim=1)
                        loss = loss_func_cel(pred_docs, y)
                        loss_val += loss.item()
                        total += len(y)
                        y_pred = pred_docs.argmax(axis=1)
                        correct += (y_pred == y).sum().item()
                        pbar.update(len(y))
                        del doc_tids, TFs, DFs, y
                        del pred_docs, loss
                    loss_val = loss_val / (i + 1)

            scheduler.step(loss_val)
            # Early stopping on validation loss; keep the best model on CPU.
            if best - loss_val > 0.0001:
                best = loss_val
                counter = 1
                best_acc = correct / total
                best_model = copy.deepcopy(self._model).to('cpu')
            elif counter > self.patience:
                break
            else:
                counter += 1

        self._model = best_model.to(self._device)
        self._loss = best
        self._acc = best_acc
        return self

    def predict(self, X):
        if self._model is None or self._tokenizer is None:
            raise Exception("Model has not been fitted yet; call fit() first.")
        self._model.eval()
        self._tokenizer.model = 'topk'
        dataloader = DataLoader(X, batch_size=self.batch_size, shuffle=False,
                                collate_fn=self.collate_predict,
                                num_workers=self.n_jobs)
        result = []
        with torch.no_grad():
            for doc_tids, TFs, DFs in dataloader:
                doc_tids = doc_tids.to(self._device)
                TFs = TFs.to(self._device)
                DFs = DFs.to(self._device)
                pred_docs, _, _ = self._model(doc_tids, TFs, DFs)
                pred_docs = torch.softmax(
                    pred_docs, dim=1).argmax(axis=1).cpu().detach().numpy()
                result.extend(list(pred_docs))
        return self._tokenizer.le.inverse_transform(np.array(result))

    def to(self, device):
        self._device = device
        if self._model is not None:
            self._model.to(self._device)
        return self
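# A minimal usage sketch, assuming the toy texts/labels below (illustrative only)
# and that the Tokenizer and AttentionTFIDF model referenced above are importable
# from this module. It exercises the scikit-learn style API: fit() with an
# explicit validation split for early stopping, then predict() on new documents.
if __name__ == '__main__':
    texts = ["the team won the final match", "stocks fell sharply today",
             "a thrilling overtime victory", "markets rally on strong earnings"]
    labels = ["sport", "finance", "sport", "finance"]
    clf = AttentionTFIDFClassifier(nepochs=5, batch_size=2,
                                   _device=torch.device('cpu'), _verbose=True)
    # The tiny corpus doubles as its own validation set here, purely for illustration.
    clf.fit(texts, labels, X_val=texts, y_val=labels)
    print(clf.predict(["who won the game last night"]))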
# Pure character embeddings
# 92%+
# Prepare the data
X, y, classes = load_THUCNews_title_label()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=7322)
num_classes = len(classes)

# Convert text to character ids
print("tokenize...")
tokenizer = Tokenizer(mintf=32, cutword=False)
tokenizer.fit(X_train)

# maxlen = find_best_maxlen(X_train, mode="max")
maxlen = 48

def create_dataset(X, y, maxlen):
    X = tokenizer.transform(X)
    X = sequence.pad_sequences(
        X, maxlen=maxlen, dtype="int32",
        padding="post", truncating="post", value=0.0)
    y = tf.keras.utils.to_categorical(y)
    return X, y
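# A minimal continuation sketch (the variable names below are assumptions, not
# from the original script): apply create_dataset to the train/test splits to get
# padded id matrices and one-hot labels ready for a Keras model.
X_train_ids, y_train_oh = create_dataset(X_train, y_train, maxlen)
X_test_ids, y_test_oh = create_dataset(X_test, y_test, maxlen)
print(X_train_ids.shape, y_train_oh.shape)  # (num_samples, 48), (num_samples, num_classes)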