Example #1

def get_dataloaders(
    data_dir,
    task_name="MultiRC",
    splits=["train", "valid", "test"],
    max_data_samples=None,
    max_sequence_length=256,
    tokenizer_name="bert-base-uncased",
    batch_size=16,
):
    """Load data and return dataloaders"""

    dataloaders = []

    tokenizer = get_tokenizer(tokenizer_name)

    for split in splits:
        dataset = get_dataset(data_dir, task_name, split, tokenizer,
                              max_data_samples, max_sequence_length)
        dataloader = MultitaskDataLoader(
            task_to_label_dict={task_name: "labels"},
            dataset=dataset,
            split=split,
            batch_size=batch_size,
            shuffle=(split == "train"),
        )
        dataloaders.append(dataloader)

        logger.info(
            f"Loaded {split} for {task_name} with {len(dataset)} samples.")

    return dataloaders
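
A minimal usage sketch; the data_dir path is a placeholder and the unpacking order simply follows the splits argument:

train_dl, valid_dl, test_dl = get_dataloaders(
    data_dir="data/multirc",  # placeholder path
    task_name="MultiRC",
    splits=["train", "valid", "test"],
    batch_size=32,
)
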
Example #2
    def __init__(self, config):
        super().__init__()
        self.config = config
        df = pd.read_csv(os.path.join(self.config.data_path, 'sentiments.csv'))
        self.bot = ExamAlarmBot()

        self.pred_sentences = [df.sentence[i] for i in range(10, 20)]
        self.pred_sentiments = [df.sentiments[i] for i in range(10, 20)]

        self.tokenizer = get_tokenizer(self.config)
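
The fragment above only wires up state; a plausible companion hook is sketched below. Both the on_epoch_end body and the bot's send method are assumptions, not the actual ExamAlarmBot API:

    def on_epoch_end(self, epoch, logs=None):
        # Hypothetical hook: push the latest validation metric to the bot.
        val_acc = (logs or {}).get('val_accuracy')
        self.bot.send(f"epoch {epoch}: val_accuracy={val_acc}")
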
Example #3
    def __init__(self, config, model_name="sentiments.h5"):
        self.config = config

        self.model_name = model_name
        self.tokenizer = get_tokenizer(self.config)
        with open(os.path.join(self.config.data_path, self.config.test_set), 'rb') as f:
            (self.test_x, self.test_y) = pickle.load(f)

        adapter_size = None # use None to fine-tune all of BERT
        self.model = create_model(self.config, adapter_size=adapter_size)
        self.model.load_weights(os.path.join(self.config.epoch_model_path, model_name))
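
The constructor above only restores the weights; a minimal evaluation method is sketched below, assuming self.model is a tf.keras model compiled with a single accuracy metric and the pickled test_x / test_y are ready-to-use arrays:

    def evaluate(self):
        # Report loss and accuracy of the restored weights on the test set.
        loss, acc = self.model.evaluate(self.test_x, self.test_y, verbose=0)
        print(f"{self.model_name}: loss={loss:.4f} acc={acc:.4f}")
        return acc
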
Example #4
def get_dataloaders(
    data_dir,
    task_name="MultiRC",
    splits=["train", "val", "test"],
    max_data_samples=None,
    max_sequence_length=256,
    tokenizer_name="xlnet-base-cased",
    batch_size=16,
    augment=False,
    uid="uids",
):
    """Load data and return dataloaders"""

    dataloaders = []

    tokenizer = get_tokenizer(tokenizer_name)

    for split in splits:
        jsonl_path = os.path.join(
            data_dir, task_name,
            SuperGLUE_TASK_SPLIT_MAPPING[task_name][split])
        dataset = parsers.parser[task_name](jsonl_path, tokenizer, uid,
                                            max_data_samples,
                                            max_sequence_length)
        dataloader = EmmentalDataLoader(
            task_to_label_dict={task_name: "labels"},
            dataset=dataset,
            split=split,
            batch_size=batch_size,
            shuffle=(split == "train"),
        )
        dataloaders.append(dataloader)

        if (augment and split == "train"
                and task_name in augmentation.augmentation_funcs):
            augmentation_funcs = augmentation.augmentation_funcs[task_name]
            for af in augmentation_funcs:
                dataset = af(dataset)
                dataloader = EmmentalDataLoader(
                    task_to_label_dict={task_name: "labels"},
                    dataset=dataset,
                    split=split,
                    batch_size=batch_size,
                    shuffle=(split == "train"),
                )
                dataloaders.append(dataloader)

        logger.info(
            f"Loaded {split} for {task_name} with {len(dataset)} samples.")

    return dataloaders
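
The augmentation hook above expects a per-task registry of functions that map a dataset to a new dataset. A minimal sketch of that structure, inferred from the call site rather than taken from the library:

# Hypothetical registry: task name -> list of dataset-to-dataset functions.
def identity_augment(dataset):
    # Toy transform; a real function would return an augmented copy.
    return dataset

augmentation_funcs = {
    "MultiRC": [identity_augment],
}
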
Example #5
def preprocess(sentences, tokenize_method, unit):
    '''
    Tokenize each sentence with the given method and unit, then pad all
    tokenized sentences to a common length.
    '''
    tokenize_func = tokenizer.get_tokenizer(tokenize_method, unit)
    tokenized_sentences = [
        tokenize_func(sentence)
        for sentence in tqdm(sentences, desc='Tokenizing')
    ]
    max_tokens, padded_sentences = pad(tokenized_sentences)

    return padded_sentences
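
pad is not shown above; a minimal sketch consistent with its call site, which must return the maximum token count and the padded sentences (the "<pad>" symbol is an assumption):

def pad(tokenized_sentences, pad_token="<pad>"):
    # Right-pad every sentence to the length of the longest one.
    max_tokens = max(len(s) for s in tokenized_sentences)
    padded_sentences = [s + [pad_token] * (max_tokens - len(s))
                        for s in tokenized_sentences]
    return max_tokens, padded_sentences
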
Example #6
    def __init__(self, args):

        if args.use_gpu and torch.cuda.is_available():
            cudnn.benchmark = True
            torch.manual_seed(args.seed)
            torch.cuda.manual_seed_all(args.seed)
            use_gpu = True
        else:
            print("Currently using CPU, however, GPU is highly recommended")
            use_gpu = False

        self.args = args
        self.device = torch.device('cuda' if use_gpu else 'cpu')
        self.model = None

        tokenizer = get_tokenizer(args)

        if args.model_name == 'atae_lstm':

            self.model = MYATAE_LSTM(tokenizer.embeddings,
                                     embed_dim=tokenizer.embed_dim,
                                     hidden_dim=args.hidden_dim,
                                     device=self.device)
            args.inputs_cols = ['text_raw_indices', 'aspect_indices']
        else:
            raise ValueError('invalid model')

        # data loader
        if args.train:
            train_dataset = SEResDataset(data_path=args.train_file,
                                         tokenizer=tokenizer)
            self.trainloader = DataLoader(dataset=train_dataset,
                                          batch_size=args.batch_size,
                                          shuffle=True)

        test_dataset = SEResDataset(data_path=args.test_file,
                                    tokenizer=tokenizer)
        self.testloader = DataLoader(dataset=test_dataset,
                                     batch_size=args.batch_size,
                                     shuffle=False)

        if use_gpu:
            self.model = nn.DataParallel(self.model).cuda()

        self.log_writer = SummaryWriter(comment=args.pretrain_type)
        # self.log_writer.add_graph(self.model, torch.LongTensor(2, 2, 5).random_(0, 10))

        if args.load_dir and os.path.exists(args.load_dir):
            checkpoint = torch.load(args.load_dir)
            self.model.load_state_dict(checkpoint['model'])
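
A minimal sketch of constructing the trainer above directly from code; the class name Instructor and every field on args are assumptions inferred from what __init__ reads:

import argparse

args = argparse.Namespace(
    use_gpu=True, seed=42, model_name='atae_lstm', hidden_dim=300,
    train=True, train_file='data/train.xml', test_file='data/test.xml',
    batch_size=64, pretrain_type='glove', load_dir=None,
)
trainer = Instructor(args)  # hypothetical name for the class defined above
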
Example #7
def tokenize_from_tsv(tokenizer_name: str,
                      input_path: str,
                      output_path: str,
                      y_index: int = 0,
                      x_index: int = 1,
                      y_header: str = "label",
                      x_header: str = "text") -> None:
    """
    Tokenizing on input_path file and saving to output_path file

    Args:
        
    """

    tokenizer = get_tokenizer(tokenizer_name)
    df = pd.read_csv(input_path, header=0, sep="\t")
    total = len(df)
    print(">> Strart Tokenizing This File Like Below...")
    print(df.head(-10))

    with open(output_path, "w", encoding="utf-8") as f1:
        f1.write(y_header + "\t" + x_header + "\n")
        row_iterator = df.iterrows()

        for index, row in tqdm(row_iterator, total=total):
            sentence = row[x_index]
            label = row[y_index]

            if is_nan(sentence) or is_nan(label):
                continue
            replaced = label.replace(" ", "_")
            sentence = sentence.replace("\n", "").strip()

            tokens = tokenizer(sentence)
            tokenized_sent = " ".join(_post_processing(tokens))
            if is_nan(tokens) or tokenized_sent == "":
                continue

            f1.write(replaced + "\t" + tokenized_sent + "\n")
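
A hypothetical invocation; the tokenizer name and both paths are placeholders:

tokenize_from_tsv("mecab",
                  input_path="data/train.tsv",
                  output_path="data/train_tokenized.tsv")
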
Example #8
import tensorflow as tf

from tokenizer import get_tokenizer
from transformer import Transformer
from utils import CustomSchedule, create_masks
from test import Translator

BUFFER_SIZE = 20000
BATCH_SIZE = 64
MAX_SEQ_LENGTH = 128

train_dataset, val_dataset, tokenizer_en, tokenizer_zh = \
    get_tokenizer(MAX_SEQ_LENGTH, BATCH_SIZE)

# Chinese -> English translation
input_vocab_size = 21128
target_vocab_size = tokenizer_en.vocab_size + 2
dropout_rate = 0.1
num_layers = 4
d_model = 512
dff = 2048
num_heads = 8

transformer = Transformer(num_layers, d_model, num_heads, dff,
                          input_vocab_size, target_vocab_size, dropout_rate)

# Dummy integer token ids; the embedding layers expect int inputs.
inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH),
                        minval=0, maxval=input_vocab_size, dtype=tf.int64)
tar_inp = tf.random.uniform((BATCH_SIZE, MAX_SEQ_LENGTH),
                            minval=0, maxval=target_vocab_size, dtype=tf.int64)
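
A sketch of pushing the dummy batch through the model. The mask helper and the Transformer call signature follow the standard TensorFlow tutorial layout and are assumptions about this repo's utils:

enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
predictions, attn_weights = transformer(
    inp, tar_inp, True,
    enc_padding_mask, combined_mask, dec_padding_mask)
print(predictions.shape)  # (BATCH_SIZE, MAX_SEQ_LENGTH, target_vocab_size)
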
Example #9
from flask import Flask, render_template
from preprocessor import preprocess_text
from model import get_model
from tokenizer import get_tokenizer
import pandas as pd
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from mappings import map_emoji

tokenizer = get_tokenizer()
model = get_model()

app = Flask(__name__)


@app.route('/<string:text>')
def index(text):
    text_processed = preprocess_text(text)
    df_test = pd.DataFrame({'Text': [text_processed]})
    X_predict = tokenizer.texts_to_sequences(df_test['Text'].values)
    X_predict = pad_sequences(X_predict)
    pred = model.predict(X_predict)
    pred_labels = np.argmax(pred, axis=1)
    pred_emojis = map_emoji(pred_labels)
    return pred_emojis[0]
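
To serve the route above, the standard Flask entry point works (host and port are arbitrary); a request such as GET /I%20love%20this then returns the predicted emoji for the decoded text:

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)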

Example #10

import tensorflow as tf
from tokenizer import get_tokenizer
import matplotlib.pyplot as plt
from attention_model import Transformer
import time

MAX_LENGTH = 40
BATCH_SIZE = 64

gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

train_dataset, tokenizer_zh = get_tokenizer(MAX_LENGTH, BATCH_SIZE)
sentences_ask = "我觉得你可能在开玩笑"  # "I think you might be joking"
sentences_ans = "但是我并不这么认为。"  # "But I don't think so."
encode_id = tokenizer_zh.convert_tokens_to_ids(sentences_ask)
print('encode_id', encode_id)
print(tokenizer_zh.convert_ids_to_tokens(encode_id))

encode_id_2 = tokenizer_zh.convert_tokens_to_ids(sentences_ans)
print('encode_id', encode_id_2)
print(tokenizer_zh.convert_ids_to_tokens(encode_id_2))


def mask_padding_token(seq):
    # 1.0 where the token id is 0 (padding), 0.0 elsewhere.
    seq = tf.cast(tf.equal(seq, 0), dtype=tf.float32)
    # Broadcast to (batch, 1, 1, seq_len) so the mask applies to attention logits.
    return seq[:, tf.newaxis, tf.newaxis, :]
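
mask_padding_token only covers padding; decoder self-attention also needs a look-ahead mask. The stock TensorFlow formulation is sketched below (this helper is not part of the snippet above):

def create_look_ahead_mask(seq_len):
    # Upper-triangular ones: position i may not attend to any j > i.
    return 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)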