def get_dataloaders(
    data_dir,
    task_name="MultiRC",
    splits=["train", "valid", "test"],
    max_data_samples=None,
    max_sequence_length=256,
    tokenizer_name="bert-base-uncased",
    batch_size=16,
):
    """Load data and return dataloaders."""
    dataloaders = []
    tokenizer = get_tokenizer(tokenizer_name)

    for split in splits:
        dataset = get_dataset(
            data_dir, task_name, split, tokenizer,
            max_data_samples, max_sequence_length,
        )
        dataloader = MultitaskDataLoader(
            task_to_label_dict={task_name: "labels"},
            dataset=dataset,
            split=split,
            batch_size=batch_size,
            shuffle=(split == "train"),  # only shuffle the training split
        )
        dataloaders.append(dataloader)
        logger.info(f"Loaded {split} for {task_name} with {len(dataset)} samples.")

    return dataloaders

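# A minimal usage sketch for the get_dataloaders above; the "data" directory
# and the unpacking into exactly three loaders are illustrative assumptions.
train_dl, valid_dl, test_dl = get_dataloaders(
    data_dir="data",
    task_name="MultiRC",
    splits=["train", "valid", "test"],
    batch_size=32,
)
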
def __init__(self, config):
    super(MyCustomCallback, self).__init__()
    self.config = config
    df = pd.read_csv(os.path.join(self.config.data_path, 'sentiments.csv'))
    self.bot = ExamAlarmBot()
    # Hold out rows 10-19 as fixed prediction examples for the callback.
    self.pred_sentences = [df.sentence[i] for i in range(10, 20)]
    self.pred_sentiments = [df.sentiments[i] for i in range(10, 20)]
    self.tokenizer = get_tokenizer(self.config)

def __init__(self, config, model_name="sentiments.h5"):
    self.config = config
    self.model_name = model_name
    self.tokenizer = get_tokenizer(self.config)
    with open(os.path.join(self.config.data_path, self.config.test_set), 'rb') as f:
        (self.test_x, self.test_y) = pickle.load(f)
    adapter_size = None  # use None to fine-tune all of BERT
    self.model = create_model(self.config, adapter_size=adapter_size)
    self.model.load_weights(os.path.join(self.config.epoch_model_path, model_name))

def get_dataloaders(
    data_dir,
    task_name="MultiRC",
    splits=["train", "val", "test"],
    max_data_samples=None,
    max_sequence_length=256,
    tokenizer_name="xlnet-base-cased",
    batch_size=16,
    augment=False,
    uid="uids",
):
    """Load data and return dataloaders."""
    dataloaders = []
    tokenizer = get_tokenizer(tokenizer_name)

    for split in splits:
        jsonl_path = os.path.join(
            data_dir, task_name, SuperGLUE_TASK_SPLIT_MAPPING[task_name][split]
        )
        dataset = parsers.parser[task_name](
            jsonl_path, tokenizer, uid, max_data_samples, max_sequence_length
        )
        dataloader = EmmentalDataLoader(
            task_to_label_dict={task_name: "labels"},
            dataset=dataset,
            split=split,
            batch_size=batch_size,
            shuffle=(split == "train"),
        )
        dataloaders.append(dataloader)

        # Optionally add one extra loader per augmentation function on train.
        if (augment and split == "train"
                and task_name in augmentation.augmentation_funcs):
            augmentation_funcs = augmentation.augmentation_funcs[task_name]
            for af in augmentation_funcs:
                dataset = af(dataset)
                dataloader = EmmentalDataLoader(
                    task_to_label_dict={task_name: "labels"},
                    dataset=dataset,
                    split=split,
                    batch_size=batch_size,
                    shuffle=(split == "train"),
                )
                dataloaders.append(dataloader)

        logger.info(f"Loaded {split} for {task_name} with {len(dataset)} samples.")

    return dataloaders

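# Hypothetical call exercising the augmentation branch of the variant above.
# With augment=True the result may hold several loaders for the train split
# (one per augmentation function), so it is kept as a list rather than unpacked.
dataloaders = get_dataloaders(
    data_dir="data",  # assumed data directory
    task_name="MultiRC",
    splits=["train"],
    augment=True,
)
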
def preprocess(sentences, tokenize_method, unit):
    '''
    For each sentence, this function:
    1. tokenizes it with the given method and unit, and
    2. pads the tokenized result to a common length.
    '''
    tokenize_func = tokenizer.get_tokenizer(tokenize_method, unit)
    tokenized_sentences = [
        tokenize_func(sentence)
        for sentence in tqdm(sentences, desc='Tokenizing')
    ]
    max_tokens, padded_sentences = pad(tokenized_sentences)
    return padded_sentences

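# A usage sketch for preprocess; "whitespace" and "word" are placeholder
# values, since the methods and units accepted by tokenizer.get_tokenizer
# are not shown in this snippet.
padded = preprocess(
    ["the first sentence", "a second, slightly longer sentence"],
    tokenize_method="whitespace",  # hypothetical method name
    unit="word",                   # hypothetical unit name
)
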
def __init__(self, args):
    if args.use_gpu and torch.cuda.is_available():
        cudnn.benchmark = True
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        use_gpu = True
    else:
        print("Currently using CPU; however, GPU is highly recommended")
        use_gpu = False

    self.args = args
    self.device = torch.device('cuda' if use_gpu else 'cpu')
    self.model = None
    tokenizer = get_tokenizer(args)

    if args.model_name == 'atae_lstm':
        self.model = MYATAE_LSTM(tokenizer.embeddings,
                                 embed_dim=tokenizer.embed_dim,
                                 hidden_dim=args.hidden_dim,
                                 device=self.device)
        args.inputs_cols = ['text_raw_indices', 'aspect_indices']
    else:
        raise ValueError('invalid model')

    # data loaders
    if args.train:
        train_dataset = SEResDataset(data_path=args.train_file, tokenizer=tokenizer)
        self.trainloader = DataLoader(dataset=train_dataset,
                                      batch_size=args.batch_size,
                                      shuffle=True)
        test_dataset = SEResDataset(data_path=args.test_file, tokenizer=tokenizer)
        self.testloader = DataLoader(dataset=test_dataset,
                                     batch_size=args.batch_size,
                                     shuffle=False)

    if use_gpu:
        self.model = nn.DataParallel(self.model).cuda()

    self.log_writer = SummaryWriter(comment=args.pretrain_type)
    # self.log_writer.add_graph(self.model, torch.LongTensor(2, 2, 5).random_(0, 10))

    if args.load_dir and os.path.exists(args.load_dir):
        checkpoint = torch.load(args.load_dir)
        self.model.load_state_dict(checkpoint['model'])

def tokenize_from_tsv(tokenizer_name: str,
                      input_path: str,
                      output_path: str,
                      y_index: int = 0,
                      x_index: int = 1,
                      y_header: str = "label",
                      x_header: str = "text") -> None:
    """
    Tokenize the sentences in input_path and save the result to output_path.

    Args:
        tokenizer_name: name of the tokenizer passed to get_tokenizer.
        input_path: tab-separated input file with a header row.
        output_path: tab-separated output file (label, tokenized text).
        y_index: column index of the label.
        x_index: column index of the sentence.
        y_header: header name written for the label column.
        x_header: header name written for the text column.
    """
    tokenizer = get_tokenizer(tokenizer_name)
    df = pd.read_csv(input_path, header=0, sep="\t")
    total = len(df)
    print(">> Start tokenizing this file, previewed below...")
    print(df.head(10))

    with open(output_path, "w", encoding="utf-8") as f1:
        f1.write(y_header + "\t" + x_header + "\n")
        row_iterator = df.iterrows()
        for index, row in tqdm(row_iterator, total=total):
            sentence = row[x_index]
            label = row[y_index]
            if is_nan(sentence) or is_nan(label):
                continue
            replaced = label.replace(" ", "_")
            sentence = sentence.replace("\n", "").strip()
            tokens = tokenizer(sentence)
            tokenized_sent = " ".join(_post_processing(tokens))
            # Skip rows whose tokenization comes back empty.
            if is_nan(tokens) or tokenized_sent == "":
                continue
            f1.write(replaced + "\t" + tokenized_sent + "\n")

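# Example invocation of tokenize_from_tsv; the tokenizer name and file paths
# are illustrative assumptions.
tokenize_from_tsv(
    tokenizer_name="mecab",  # hypothetical tokenizer name
    input_path="data/train.tsv",
    output_path="data/train.tokenized.tsv",
)
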
print(e)

import tensorflow as tf  # required for tf.random.uniform below
from tokenizer import get_tokenizer
from transformer import Transformer
from utils import CustomSchedule, create_masks
from test import Translator

BUFFER_SIZE = 20000
BATCH_SIZE = 64
MAX_SEQ_LENGTH = 128

train_dataset, val_dataset, tokenizer_en, tokenizer_zh = \
    get_tokenizer(MAX_SEQ_LENGTH, BATCH_SIZE)

# Chinese -> English translation
input_vocab_size = 21128  # vocabulary size of the Chinese BERT tokenizer
target_vocab_size = tokenizer_en.vocab_size + 2  # +2 for start/end tokens
dropout_rate = 0.1

num_layers = 4
d_model = 512
dff = 2048
num_heads = 8

transformer = Transformer(num_layers, d_model, num_heads, dff,
                          input_vocab_size, target_vocab_size, dropout_rate)

inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH))
tar_inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH))

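# Hypothetical forward pass, assuming this Transformer follows the call
# signature of the TensorFlow transformer tutorial (which CustomSchedule and
# create_masks appear to come from); not confirmed by the snippet itself.
enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
predictions, attention_weights = transformer(
    inp, tar_inp, True, enc_padding_mask, combined_mask, dec_padding_mask
)
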
from flask import Flask, render_template
from preprocessor import preprocess_text
from model import get_model
from tokenizer import get_tokenizer
import pandas as pd
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from mappings import map_emoji

tokenizer = get_tokenizer()
model = get_model()

app = Flask(__name__)


@app.route('/<string:text>')
def index(text):
    # Preprocess, tokenize, and pad the input, then map the model's
    # most probable class to an emoji.
    text_processed = preprocess_text(text)
    df_test = pd.DataFrame({'Text': [text_processed]})
    X_predict = tokenizer.texts_to_sequences(df_test['Text'].values)
    X_predict = pad_sequences(X_predict)
    pred = model.predict(X_predict)
    pred_labels = np.argmax(pred, axis=1)
    pred_emojis = map_emoji(pred_labels)
    return pred_emojis[0]

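# Running the app with Flask's built-in development server (for local testing
# only); a GET request to /some_text then returns the predicted emoji.
if __name__ == '__main__':
    app.run(host='127.0.0.1', port=5000)
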
import tensorflow as tf
from tokenizer import get_tokenizer
import matplotlib.pyplot as plt
from attention_model import Transformer
import time

MAX_LENGTH = 40
BATCH_SIZE = 64

# Let GPU memory grow on demand instead of reserving it all up front.
gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

train_dataset, tokenizer_zh = get_tokenizer(MAX_LENGTH, BATCH_SIZE)

sentences_ask = "我觉得你可能在开玩笑"  # "I think you might be joking"
sentences_ans = '但是我并不这么认为。'  # "But I don't think so."

encode_id = tokenizer_zh.convert_tokens_to_ids(sentences_ask)
print('encode_id', encode_id)
print(tokenizer_zh.convert_ids_to_tokens(encode_id))

encode_id_2 = tokenizer_zh.convert_tokens_to_ids(sentences_ans)
print('encode_id_2', encode_id_2)
print(tokenizer_zh.convert_ids_to_tokens(encode_id_2))


def mask_padding_token(seq):
    # 1.0 where the sequence holds the padding id 0, 0.0 elsewhere.
    seq = tf.cast(tf.equal(seq, 0), dtype=tf.float32)
    # broadcastable shape (batch, 1, 1, seq_length) for attention logits
    return seq[:, tf.newaxis, tf.newaxis, :]

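# Quick check of mask_padding_token: id 0 counts as padding, so the mask is
# 1.0 exactly at the padded positions, with shape (2, 1, 1, 4) here.
demo = tf.constant([[7, 6, 0, 0], [1, 2, 3, 0]])
print(mask_padding_token(demo))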