Example #1
    def __init__(self, cf, mode='train', transform=None):
        # mode: 'train' or 'test'
        self.cf = cf
        self.mode = mode
        self.transform = transform
        if mode == 'train':
            # imdb_dataset returns only the split selected via the train=True / test=True flags.
            self.data = imdb_dataset(directory=cf.folder_of_data + '/imdb/', train=True)
            # The cleaned text replaces the original text in self.data; cleaning has to happen
            # here because the text may be used to build the w2v models.
            self.clean_all_text()
            self.load_w2v_models()
        else:
            self.data = imdb_dataset(directory=cf.folder_of_data + '/imdb/', test=True)
            self.clean_all_text()
Example #2
def preprocess_imdb(train_size: int = 1000, test_size: int = 100) -> dict:
    train_data, test_data = imdb_dataset(train=True, test=True)
    random.shuffle(train_data)
    random.shuffle(test_data)
    train_data = train_data[:train_size]
    test_data = test_data[:test_size]

    train_texts, test_texts = (
        [d["text"] for d in data] for data in (train_data, test_data)
    )

    train_labels, test_labels = (
        [d["sentiment"] for d in data] for data in (train_data, test_data)
    )

    train_tokens, train_tokens_ids = tokenize(train_texts)
    test_tokens, test_tokens_ids = tokenize(test_texts)

    train_y, test_y = (
        np.array(labels) == "pos" for labels in (train_labels, test_labels)
    )

    return {
        "test_labels": test_labels,
        "test_texts": test_texts,
        "test_tokens": test_tokens,
        "test_tokens_ids": test_tokens_ids,
        "test_y": test_y,
        "train_labels": train_labels,
        "train_texts": train_texts,
        "train_tokens": train_tokens,
        "train_tokens_ids": train_tokens_ids,
        "train_y": train_y,
    }
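The tokenize helper used above is defined elsewhere in the source file and is not shown here. As a point of reference only, the following is a hypothetical sketch of what such a helper might look like, assuming the BertTokenizer / pad_sequences pipeline used in Examples #11 and #15; the signature and the max_len default are assumptions.

from keras.preprocessing.sequence import pad_sequences
from pytorch_pretrained_bert import BertTokenizer

def tokenize(texts, max_len=512):
    # Hypothetical reimplementation; the original helper is not shown in this example.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    # Prepend [CLS] and truncate so each sequence fits within max_len tokens.
    tokens = [['[CLS]'] + tokenizer.tokenize(t)[:max_len - 1] for t in texts]
    token_ids = pad_sequences(
        [tokenizer.convert_tokens_to_ids(t) for t in tokens],
        maxlen=max_len, truncating='post', padding='post', dtype='int')
    return tokens, token_ids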
Example #3
def test_imdb_dataset_row(mock_urlretrieve):
    mock_urlretrieve.side_effect = urlretrieve_side_effect

    # Check that a row is parsed correctly
    train, test = imdb_dataset(directory=directory, test=True, train=True)
    assert len(train) > 0
    assert len(test) > 0
    assert test[0] == {
        'text':
            "My boyfriend and I went to watch The Guardian.At first I didn't want to watch it, " +
            "but I loved the movie- It was definitely the best movie I have seen in sometime." +
            "They portrayed the USCG very well, it really showed me what they do and I think " +
            "they should really be appreciated more.Not only did it teach but it was a really " +
            "good movie. The movie shows what the really do and how hard the job is.I think " +
            "being a USCG would be challenging and very scary. It was a great movie all around. " +
            "I would suggest this movie for anyone to see.The ending broke my heart but I know " +
            "why he did it. The storyline was great I give it 2 thumbs up. I cried it was very " +
            "emotional, I would give it a 20 if I could!",
        'sentiment':
            'pos'
    }

    # Clean up
    shutil.rmtree(os.path.join(directory, 'aclImdb'))
Example #4
    def load_save_docs(cls, out_dir):
        train = imdb_dataset(train=True)
        test = imdb_dataset(test=True)
        train_ = []
        test_ = []
        for td in train:
            sent = normalize_str(td['text'])
            tup = (sent, LABELS[td['sentiment']])
            train_.append(tup)
        for td in test:
            sent = normalize_str(td['text'])
            tup = (sent, LABELS[td['sentiment']])
            test_.append(tup)
        ds = IMDBData()
        ds.build(train_, test_)
        ds.save(out_dir)
Example #5
def readLang(dataset_title):
    """
    Args:
        dataset_title: either 'imdb' or 'ptb'
    """
    print("Reading lines...")
    if dataset_title == 'imdb':
        train = imdb_dataset(train=True, directory='../data/')
        # Read the dataset and split into lines
        lines = [doc['text'].strip() for doc in train]
        # Normalize lines
        lines = [
            ' '.join(["SOSTOKEN", normalizeString(s), "EOSTOKEN"])
            for s in lines
        ]
        lang = Lang(dataset_title)
    elif dataset_title == 'ptb':
        raise NotImplementedError
    return lang, lines
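The Lang and normalizeString helpers referenced above are defined elsewhere in the source file. The sketch below is a hypothetical Lang class following the usual vocabulary-builder pattern (word-to-index and index-to-word maps plus counts); the attribute and method names are assumptions, not part of the original.

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {}
        self.n_words = 0

    def add_sentence(self, sentence):
        # Add every whitespace-separated token of a normalized sentence to the vocabulary.
        for word in sentence.split(' '):
            if word not in self.word2index:
                self.word2index[word] = self.n_words
                self.index2word[self.n_words] = word
                self.word2count[word] = 1
                self.n_words += 1
            else:
                self.word2count[word] += 1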
Example #6
def imdb_to_df(is_train, label_to_idx):

    dset = imdb_dataset(train=is_train, test=not is_train)

    # create one hot encoding of labels
    num_labels = len(label_to_idx)
    all_labels = np.zeros((len(dset.rows), num_labels))
    all_label_indices = [[label_to_idx[row["sentiment"]]] for row in dset.rows]

    for i, labs in enumerate(all_label_indices):
        # binary encode the labels
        all_labels[i][labs] = 1
    all_labels = all_labels.astype(int)

    cols = ["text"]
    label_cols = ["topic_{}".format(lab) for lab in label_to_idx.keys()]
    cols.extend(label_cols)
    df = pd.DataFrame(columns=cols)
    df["text"] = [row["text"] for row in dset.rows]

    df[label_cols] = all_labels

    return df
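A hypothetical call of the function above, assuming the standard IMDB sentiment labels; the label_to_idx mapping shown here is an assumption:

label_to_idx = {'neg': 0, 'pos': 1}  # assumed mapping; any sentiment-to-index dict works
train_df = imdb_to_df(is_train=True, label_to_idx=label_to_idx)
# train_df has a "text" column plus one binary column per label: topic_neg, topic_pos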
Example #7
def test_imdb_dataset_row(mock_urlretrieve):
    mock_urlretrieve.side_effect = urlretrieve_side_effect

    # Check that a row is parsed correctly
    train, test = imdb_dataset(directory=directory, test=True, train=True)
    assert len(train) > 0
    assert len(test) > 0
    test = sorted(test, key=lambda r: len(r['text']))
    assert test[0] == {
        'text':
            "This movie was sadly under-promoted but proved to be truly exceptional. Entering " +
            "the theatre I knew nothing about the film except that a friend wanted to see it." +
            "<br /><br />I was caught off guard with the high quality of the film. I couldn't " +
            "image Ashton Kutcher in a serious role, but his performance truly exemplified his " +
            "character. This movie is exceptional and deserves our monetary support, unlike so " +
            "many other movies. It does not come lightly for me to recommend any movie, but in " +
            "this case I highly recommend that everyone see it.<br /><br />This films is Truly " +
            "Exceptional!",
        'sentiment':
            'pos'
    }

    # Clean up
    shutil.rmtree(os.path.join(directory, 'aclImdb'))
Example #8
    def __init__(self, is_train: bool, tokenizer):
        super().__init__()
        self.tokenizer = tokenizer
        self.data = imdb_dataset(train=is_train, test=not is_train)
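The snippet only shows the constructor; a torch-style Dataset also needs __len__ and __getitem__. A hypothetical completion follows; the tokenizer call and the label encoding are assumptions, not part of the original.

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data[idx]
        # Assumed: the tokenizer maps raw text to model inputs, and the
        # 'pos'/'neg' sentiments are encoded as 1/0.
        encoded = self.tokenizer(row['text'])
        label = 1 if row['sentiment'] == 'pos' else 0
        return encoded, label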
Example #9
import random as rn

import numpy as np
import torch
from torch import nn
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_pretrained_bert import BertModel, BertTokenizer
from torchnlp.datasets import imdb_dataset
from keras.preprocessing.sequence import pad_sequences
from IPython.display import clear_output

rn.seed(321)
np.random.seed(321)
torch.manual_seed(321)
torch.cuda.manual_seed(321)

train_data, test_data = imdb_dataset(train=True, test=True)
rn.shuffle(train_data)
rn.shuffle(test_data)
train_data = train_data[:1000]
test_data = test_data[:100]

train_texts, train_labels = list(
    zip(*map(lambda d: (d['text'], d['sentiment']), train_data)))
test_texts, test_labels = list(
    zip(*map(lambda d: (d['text'], d['sentiment']), test_data)))

len(train_texts), len(train_labels), len(test_texts), len(test_labels)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)
Example #10
def main():
    train_test = ('imdb', 'tomato') if TRAIN_IMDB else ('tomato', 'imdb')
    print("====training on {} and testing on {}======".format(*train_test))
    """read imdb dataset"""
    train_data, test_data = imdb_dataset(train=True, test=True)
    if not FULL:
        train_valid_data = random.sample(train_data, 10000)
        train_data, valid_data = train_valid_data[:8000], train_valid_data[8000:]
        test_data = random.sample(test_data, 2000)
    else:
        test_valid_data = random.sample(test_data, 4000)
        valid_data, test_data = test_valid_data[:2000], test_valid_data[2000:]

    train_dataset, valid_dataset, test_dataset = SentDataset(train_data), SentDataset(valid_data), SentDataset(test_data)
    trainIteration = data.DataLoader(dataset=train_dataset, collate_fn=sort_batch, batch_size=50, shuffle=True)
    validIteration = data.DataLoader(dataset=valid_dataset, collate_fn=sort_batch, batch_size=50)
    testIteration = data.DataLoader(dataset=test_dataset, collate_fn=sort_batch, batch_size=50)
    
    """read tomato dataset"""
    with open("../data/rotten_tomato_train.json", "r") as read_file:
        tomato_train_data = json.load(read_file)
    with open("../data/rotten_tomato_dev.json", "r") as read_file:
        tomato_valid_data = json.load(read_file)
    with open("../data/rotten_tomato_test.json", "r") as read_file:
        tomato_test_data = json.load(read_file)
    tomato_train_dataset, tomato_valid_dataset, tomato_test_dataset = SentDataset(tomato_train_data), SentDataset(tomato_valid_data), SentDataset(tomato_test_data)
    tomato_trainIteration = data.DataLoader(dataset=tomato_train_dataset, collate_fn=sort_batch, batch_size=50, shuffle=True)
    tomato_validIteration = data.DataLoader(dataset=tomato_valid_dataset, collate_fn=sort_batch, batch_size=50)
    tomato_testIteration = data.DataLoader(dataset=tomato_test_dataset, collate_fn=sort_batch, batch_size=50)

    """create model"""
    model = BERT_biLSTM(HIDDEN_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)
    model = model.to(device)
    if device == 'cuda': model = nn.DataParallel(model)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=LEARNING_RATE)
    criterion = nn.BCEWithLogitsLoss()
    criterion = criterion.to(device)

    best_valid_loss = float('inf')

    """start training"""
    for epoch in range(N_EPOCHS):
        start_time = time.time()

        train_loss, train_acc = train(model, trainIteration if TRAIN_IMDB else tomato_trainIteration, optimizer, criterion, epoch)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        
        valid_loss, valid_acc = evaluate(model, validIteration if TRAIN_IMDB else tomato_validIteration, criterion)
        
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            if SAVE: torch.save(model.state_dict(), '../save/robust_BERT_model_{}.pt'.format('imdb' if TRAIN_IMDB else 'tomato'))

        print(f'Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc * 100:.2f}%')
        
    """start testing on other dataset"""
    print("=====test result on own dataset=====")
    test_loss, test_acc = evaluate(model, testIteration if TRAIN_IMDB else tomato_testIteration, criterion)
    print(f'\t Test. Loss: {test_loss:.3f} |  Test. Acc: {test_acc * 100:.2f}%')
    print()
    print("=====test result on other dataset=====")
    test_loss, test_acc = evaluate(model, tomato_testIteration if TRAIN_IMDB else testIteration, criterion)
    print(f'\t Test. Loss: {test_loss:.3f} |  Test. Acc: {test_acc * 100:.2f}%')
Example #11
def training(batch_size, epoch_size, filename):
    '''Loads the data and trains the model in the same function (admittedly not ideal).'''
    pd.set_option('display.max_columns', None)
    # NOTE: the IMDB data loaded here is overwritten below by the fake-news data.
    train_data, test_data = imdb_dataset(train=True, test=True)
    df = pd.read_csv("./data/fake.csv")
    df = df[['text', 'type']]
    #print(len(df))

    #print(Counter(df['type'].values))

    df = df[df['type'].isin(['fake', 'satire'])]
    df.dropna(inplace=True)
    df_fake = df[df['type'] == 'fake']
    df_satire = df[df['type'] == 'satire']
    # Balance the two classes before shuffling.
    df_satire = df_satire.sample(n=len(df_fake))
    df = pd.concat([df_satire, df_fake])
    df = df.sample(frac=1, random_state=24).reset_index(drop=True)

    #print(Counter(df['type'].values))

    train_data = df.head(19)
    test_data = df.tail(19)

    #print(train_data)
    # Pair each text with its corresponding label.
    train_data = [{
        'text': text,
        'type': type_data
    } for text, type_data in zip(train_data['text'], train_data['type'])]
    test_data = [{
        'text': text,
        'type': type_data
    } for text, type_data in zip(test_data['text'], test_data['type'])]

    train_texts, train_labels = list(
        zip(*map(lambda d: (d['text'], d['type']), train_data)))
    test_texts, test_labels = list(
        zip(*map(lambda d: (d['text'], d['type']), test_data)))

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)
    train_tokens = list(
        map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], train_texts))
    test_tokens = list(
        map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], test_texts))

    train_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, train_tokens))
    test_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, test_tokens))

    train_tokens_ids = pad_sequences(train_tokens_ids,
                                     maxlen=512,
                                     truncating="post",
                                     padding="post",
                                     dtype="int")
    test_tokens_ids = pad_sequences(test_tokens_ids,
                                    maxlen=512,
                                    truncating="post",
                                    padding="post",
                                    dtype="int")

    train_y = np.array(train_labels) == 'fake'
    test_y = np.array(test_labels) == 'fake'

    BATCH_SIZE = batch_size
    EPOCHS = epoch_size

    train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
    test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]
    train_masks_tensor = torch.tensor(train_masks)
    test_masks_tensor = torch.tensor(test_masks)

    train_tokens_tensor = torch.tensor(train_tokens_ids)
    train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()
    test_tokens_tensor = torch.tensor(test_tokens_ids)
    test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()
    train_dataset = torch.utils.data.TensorDataset(train_tokens_tensor,
                                                   train_masks_tensor,
                                                   train_y_tensor)
    train_sampler = torch.utils.data.RandomSampler(train_dataset)
    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   sampler=train_sampler,
                                                   batch_size=BATCH_SIZE)
    test_dataset = torch.utils.data.TensorDataset(test_tokens_tensor,
                                                  test_masks_tensor,
                                                  test_y_tensor)
    test_sampler = torch.utils.data.SequentialSampler(test_dataset)
    test_dataloader = torch.utils.data.DataLoader(test_dataset,
                                                  sampler=test_sampler,
                                                  batch_size=BATCH_SIZE)

    bert_clf = BertBinaryClassifier()
    optimizer = torch.optim.Adam(bert_clf.parameters(), lr=3e-6)

    for epoch_num in range(EPOCHS):
        bert_clf.train()
        train_loss = 0
        for step_num, batch_data in enumerate(train_dataloader):
            token_ids, masks, labels = tuple(t for t in batch_data)
            probas = bert_clf(token_ids, masks)
            loss_func = nn.BCELoss()
            batch_loss = loss_func(probas, labels)
            train_loss += batch_loss.item()
            bert_clf.zero_grad()
            batch_loss.backward()
            optimizer.step()
            print('Epoch: ', epoch_num + 1)
            print("\r" +
                  "{0}/{1} loss: {2} ".format(step_num,
                                              len(train_data) /
                                              BATCH_SIZE, train_loss /
                                              (step_num + 1)))

    torch.save(bert_clf, filename)
    return
Example #12
from slp.plbind.module import RnnPLModule
from slp.plbind.trainer import make_trainer, watch_model
from slp.util.log import configure_logging

MAX_LENGTH = 1024
collate_fn = SequenceClassificationCollator(device="cpu", max_length=MAX_LENGTH)
# collate_fn = SequenceClassificationCollator(device="cpu")


if __name__ == "__main__":
    pl.utilities.seed.seed_everything(seed=42)
    EXPERIMENT_NAME = "imdb-words-sentiment-classification"

    configure_logging(f"logs/{EXPERIMENT_NAME}")

    train, test = imdb_dataset(directory="./data/", train=True, test=True)

    raw_train = [d["text"] for d in train]
    labels_train = [d["sentiment"] for d in train]

    raw_test = [d["text"] for d in test]
    labels_test = [d["sentiment"] for d in test]

    ldm = PLDataModuleFromCorpus(
        raw_train,
        labels_train,
        test=raw_test,
        test_labels=labels_test,
        batch_size=64,
        batch_size_eval=32,
        collate_fn=collate_fn,
Example #13
    tokenizer = SpacyTokenizer()
    to_token_ids = ToTokenIds(word2idx)
    to_tensor = ToTensor(device='cpu')

    def create_dataloader(d):
        d = (DatasetWrapper(d).map(tokenizer).map(to_token_ids).map(to_tensor))
        return DataLoader(d,
                          batch_size=32,
                          num_workers=1,
                          pin_memory=True,
                          shuffle=True,
                          collate_fn=collate_fn)

    train_loader, dev_loader = map(
        create_dataloader,
        imdb_dataset(directory='../data/', train=True, test=True))

    model = Classifier(
        WordRNN(256,
                embeddings,
                bidirectional=True,
                merge_bi='cat',
                packed_sequence=True,
                attention=True,
                device=DEVICE), 512, 3)

    optimizer = Adam([p for p in model.parameters() if p.requires_grad],
                     lr=1e-3)
    criterion = nn.CrossEntropyLoss()
    metrics = {'accuracy': Accuracy(), 'loss': Loss(criterion)}
    trainer = SequentialTrainer(
Example #14
#!/usr/bin/python
import numpy as np
import xgboost as xgb
import pandas as pd

from torchnlp.datasets import imdb_dataset

# Load the imdb training dataset
train = imdb_dataset(train=True)
train[0]  # RETURNS: {'text': 'For a movie that gets..', 'sentiment': 'pos'}
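The snippet stops after loading the training split; the unused xgboost and pandas imports suggest the reviews are meant to feed a gradient-boosted classifier. The following hypothetical continuation is a minimal sketch; the TfidfVectorizer features and the model parameters are assumptions, not part of the original.

from sklearn.feature_extraction.text import TfidfVectorizer

test = imdb_dataset(test=True)

# Bag-of-words features; the original file may use a different feature pipeline.
vectorizer = TfidfVectorizer(max_features=20000)
X_train = vectorizer.fit_transform([row['text'] for row in train])
X_test = vectorizer.transform([row['text'] for row in test])
y_train = np.array([row['sentiment'] == 'pos' for row in train], dtype=int)
y_test = np.array([row['sentiment'] == 'pos' for row in test], dtype=int)

clf = xgb.XGBClassifier(n_estimators=200, max_depth=6)
clf.fit(X_train, y_train)
print('test accuracy:', clf.score(X_test, y_test))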
Example #15
def prepare_data_bert(batch_size):
    """:returns train and test loader for the IMDB dataset formatted correctly for BERT, each item in the dataset is in
    the form (token_ids, masks, labels)"""
    print('Loading IMDB data...')

    train_data, test_data = imdb_dataset(train=True, test=True)
    rn.shuffle(train_data)
    rn.shuffle(test_data)
    train_data = train_data[:1000]
    test_data = test_data[:100]

    train_texts, train_labels = list(
        zip(*map(lambda d: (d['text'], d['sentiment']), train_data)))
    test_texts, test_labels = list(
        zip(*map(lambda d: (d['text'], d['sentiment']), test_data)))

    print('Tokenizing for BERT')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)

    train_tokens = list(
        map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'],
            train_texts))
    test_tokens = list(
        map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'],
            test_texts))
    print(list(map(tokenizer.convert_tokens_to_ids, train_tokens))[0])
    train_tokens_ids = pad_sequences(list(
        map(tokenizer.convert_tokens_to_ids, train_tokens)),
                                     maxlen=512,
                                     truncating='post',
                                     padding='post',
                                     dtype='int')
    # print(train_tokens_ids[0])
    test_tokens_ids = pad_sequences(list(
        map(tokenizer.convert_tokens_to_ids, test_tokens)),
                                    maxlen=512,
                                    truncating='post',
                                    padding='post',
                                    dtype='int')

    train_y = np.array(np.array(train_labels) == 'pos', dtype=np.uint8)
    test_y = np.array(np.array(test_labels) == 'pos', dtype=np.uint8)
    train_y.shape, test_y.shape, np.mean(train_y), np.mean(test_y)

    train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
    test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]

    train_tokens_tensor = torch.tensor(train_tokens_ids)
    train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()

    test_tokens_tensor = torch.tensor(test_tokens_ids)
    test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()

    train_masks_tensor = torch.tensor(train_masks)
    test_masks_tensor = torch.tensor(test_masks)

    train_dataset = TensorDataset(train_tokens_tensor, train_masks_tensor,
                                  train_y_tensor)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=batch_size)

    test_dataset = TensorDataset(test_tokens_tensor, test_masks_tensor,
                                 test_y_tensor)
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset,
                                 sampler=test_sampler,
                                 batch_size=batch_size)

    return train_dataloader, test_dataloader
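A minimal usage sketch for the loaders returned above; each batch unpacks in the order the TensorDatasets were built (token ids, attention masks, labels). The classifier step is a placeholder, not part of the original.

train_dataloader, test_dataloader = prepare_data_bert(batch_size=32)
for token_ids, masks, labels in train_dataloader:
    # token_ids: (batch, 512) padded BERT ids; masks: attention masks; labels: (batch, 1) floats
    pass  # forward pass of whatever BERT classifier consumes these batches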
Example #16
    def __init__(self,
                 root: str,
                 normal_class=0,
                 tokenizer='spacy',
                 use_tfidf_weights=False,
                 append_sos=False,
                 append_eos=False,
                 clean_txt=False,
                 max_seq_len_prior=None):
        super().__init__(root)

        self.n_classes = 2  # 0: normal, 1: outlier
        classes = ['pos', 'neg']

        if normal_class == -1:
            self.normal_classes = classes
            self.outlier_classes = []
        else:
            self.normal_classes = [classes[normal_class]]
            del classes[normal_class]
            self.outlier_classes = classes

        if root not in nltk.data.path:
            nltk.data.path.append(root)
        # Load the imdb dataset
        self.train_set, self.test_set = imdb_dataset(directory=root,
                                                     train=True,
                                                     test=True)

        # Pre-process
        self.train_set.columns.add('index')
        self.test_set.columns.add('index')
        self.train_set.columns.remove('sentiment')
        self.test_set.columns.remove('sentiment')
        self.train_set.columns.add('label')
        self.test_set.columns.add('label')
        self.train_set.columns.add('weight')
        self.test_set.columns.add('weight')

        train_idx_normal = []  # for subsetting train_set to normal class
        for i, row in enumerate(self.train_set):
            row['label'] = row.pop('sentiment')
            if row['label'] in self.normal_classes:
                train_idx_normal.append(i)
                row['label'] = torch.tensor(0)
            else:
                row['label'] = torch.tensor(1)
            if clean_txt:
                row['text'] = clean_text(row['text'].lower())
            else:
                row['text'] = row['text'].lower()

        test_n_idx = []  # subsetting test_set to selected normal classes
        test_a_idx = []  # subsetting test_set to selected anomalous classes
        for i, row in enumerate(self.test_set):
            row['label'] = row.pop('sentiment')
            if row['label'] in self.normal_classes:
                test_n_idx.append(i)
            else:
                test_a_idx.append(i)
            row['label'] = torch.tensor(
                0) if row['label'] in self.normal_classes else torch.tensor(1)
            if clean_txt:
                row['text'] = clean_text(row['text'].lower())
            else:
                row['text'] = row['text'].lower()

        # Subset train_set to normal class
        self.train_set = Subset(self.train_set, train_idx_normal)
        # Subset test_set to selected normal classes
        self.test_n_set = Subset(self.test_set, test_n_idx)
        # Subset test_set to selected anomalous classes
        self.test_a_set = Subset(self.test_set, test_a_idx)

        # Make corpus and set encoder
        text_corpus = [
            row['text']
            for row in datasets_iterator(self.train_set, self.test_set)
        ]
        if tokenizer == 'spacy':
            self.encoder = SpacyEncoder(text_corpus,
                                        min_occurrences=3,
                                        append_eos=append_eos)
        if tokenizer == 'bert':
            self.encoder = MyBertTokenizer.from_pretrained('bert-base-uncased',
                                                           cache_dir=root)

        # Encode
        self.max_seq_len = 0
        for row in datasets_iterator(self.train_set, self.test_set):
            if append_sos:
                sos_id = self.encoder.stoi[DEFAULT_SOS_TOKEN]
                row['text'] = torch.cat((torch.tensor(sos_id).unsqueeze(0),
                                         self.encoder.encode(row['text'])))
            else:
                row['text'] = self.encoder.encode(row['text'])
            if len(row['text']) > self.max_seq_len:
                self.max_seq_len = len(row['text'])

        # Compute tf-idf weights
        if use_tfidf_weights:
            compute_tfidf_weights(self.train_set,
                                  self.test_set,
                                  vocab_size=self.encoder.vocab_size)
        else:
            for row in datasets_iterator(self.train_set, self.test_set):
                row['weight'] = torch.empty(0)

        # Get indices after pre-processing
        for i, row in enumerate(self.train_set):
            row['index'] = i
        for i, row in enumerate(self.test_set):
            row['index'] = i

        # length prior
        sent_lengths = [len(row['text']) for row in self.train_set]
        sent_lengths_freq = np.bincount(np.array(sent_lengths))
        sent_lengths_freq = np.concatenate(
            (sent_lengths_freq,
             np.array((max_seq_len_prior - max(sent_lengths)) * [0])),
            axis=0)
        sent_lengths_freq = sent_lengths_freq + 1
        self.length_prior = np.log(sent_lengths_freq) - np.log(
            sent_lengths_freq.sum())
Example #17
train = smt_dataset(train=True, fine_grained=True)
valid = smt_dataset(dev=True, fine_grained=True)
test = smt_dataset(test=True, fine_grained=True)

train_labels = create_SMT_labels(train, len(train))
train_text = np.array(train['text'])
valid_labels = create_SMT_labels(valid, len(valid))
valid_text = np.array(valid['text'])
test_labels = create_SMT_labels(test, len(test))
test_text = np.array(test['text'])

np.save('sst_train_text', train_text)
np.save('sst_train_labels', train_labels)
np.save('sst_valid_text', valid_text)
np.save('sst_valid_labels', valid_labels)
np.save('sst_test_text', test_text)
np.save('sst_test_labels', test_labels)

train = imdb_dataset(train=True)
test = imdb_dataset(test=True)

train_labels = create_IMDB_labels(train, len(train))
test_labels = create_IMDB_labels(test, len(test))
train_text = np.array(train['text'])
test_text = np.array(test['text'])

np.save('imdb_train_text', train_text)
np.save('imdb_train_labels', train_labels)
np.save('imdb_test_text', test_text)
np.save('imdb_test_labels', test_labels)
Example #18
    def __init__(self, root: str, normal_class=0, tokenizer='spacy', use_tfidf_weights=False, append_sos=False,
                 append_eos=False, clean_txt=False):
        super().__init__(root)

        self.n_classes = 2  # 0: normal, 1: outlier
        classes = ['pos', 'neg']

        if normal_class == -1:
            self.normal_classes = classes
            self.outlier_classes = []
        else:
            self.normal_classes = [classes[normal_class]]
            del classes[normal_class]
            self.outlier_classes = classes

        # Load the imdb dataset
        self.train_set, self.test_set = imdb_dataset(directory=root, train=True, test=True)

        # Pre-process
        self.train_set.columns.add('index')
        self.test_set.columns.add('index')
        self.train_set.columns.remove('sentiment')
        self.test_set.columns.remove('sentiment')
        self.train_set.columns.add('label')
        self.test_set.columns.add('label')
        self.train_set.columns.add('weight')
        self.test_set.columns.add('weight')

        train_idx_normal = []  # for subsetting train_set to normal class
        for i, row in enumerate(self.train_set):
            row['label'] = row.pop('sentiment')
            if row['label'] in self.normal_classes:
                train_idx_normal.append(i)
                row['label'] = torch.tensor(0)
            else:
                row['label'] = torch.tensor(1)
            if clean_txt:
                row['text'] = clean_text(row['text'].lower())
            else:
                row['text'] = row['text'].lower()

        for i, row in enumerate(self.test_set):
            row['label'] = row.pop('sentiment')
            row['label'] = torch.tensor(0) if row['label'] in self.normal_classes else torch.tensor(1)
            if clean_txt:
                row['text'] = clean_text(row['text'].lower())
            else:
                row['text'] = row['text'].lower()

        # Subset train_set to normal class
        self.train_set = Subset(self.train_set, train_idx_normal)

        # Make corpus and set encoder
        text_corpus = [row['text'] for row in datasets_iterator(self.train_set, self.test_set)]
        if tokenizer == 'spacy':
            self.encoder = SpacyEncoder(text_corpus, min_occurrences=3, append_eos=append_eos)
        if tokenizer == 'bert':
            self.encoder = MyBertTokenizer.from_pretrained('bert-base-uncased', cache_dir=root)

        # Encode
        for row in datasets_iterator(self.train_set, self.test_set):
            if append_sos:
                sos_id = self.encoder.stoi[DEFAULT_SOS_TOKEN]
                row['text'] = torch.cat((torch.tensor(sos_id).unsqueeze(0), self.encoder.encode(row['text'])))
            else:
                row['text'] = self.encoder.encode(row['text'])

        # Compute tf-idf weights
        if use_tfidf_weights:
            compute_tfidf_weights(self.train_set, self.test_set, vocab_size=self.encoder.vocab_size)
        else:
            for row in datasets_iterator(self.train_set, self.test_set):
                row['weight'] = torch.empty(0)

        # Get indices after pre-processing
        for i, row in enumerate(self.train_set):
            row['index'] = i
        for i, row in enumerate(self.test_set):
            row['index'] = i
Example #19
NUM_CLASSES = 2
BATCH_SIZE = 100
LEARNING_RATE = 0.003

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if init:
    sentiment = {
        'pos': 1,
        'neg': 0,
    }

    # Load each split once instead of re-parsing the dataset separately for texts and labels.
    train = imdb_dataset(train=True)
    test = imdb_dataset(test=True)

    train_texts = [text_to_word_sequence(data['text']) for data in tqdm(train)]
    train_labels = [sentiment[data['sentiment']] for data in train]

    test_texts = [text_to_word_sequence(data['text']) for data in tqdm(test)]
    test_labels = [sentiment[data['sentiment']] for data in test]