device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


####################################
#         Hyper-parameters         #
####################################
BATCH_SIZE = 64
LEARNING_RATE = 1e-3


####################################
#          Preparing Data          #
####################################
# 1. data.Field()
TEXT = data.Field(tokenize='spacy', include_lengths=True)
LABELS = data.LabelField()

# 2. data.TabularDataset
train_data, test_data = data.TabularDataset.splits(path=dataset_path,
                                                   train="train.tsv",
                                                   test="test.tsv",
                                                   fields=[('labels', LABELS), ('text', TEXT)],
                                                   format="tsv")

# train_data, test_data = datasets.IMDB.splits(TEXT, LABELS)

print("Number of train_data = {}".format(len(train_data)))
print("Number of test_data = {}".format(len(test_data)))

print("vars(train_data[0]) = {}\n".format(vars(train_data[0])))
# convert neutral, positive and negative to numeric
# sentiment_map = {'neutral': 0, 'positive': 1, 'negative': -1}
# final_df['airline_sentiment'] = final_df['airline_sentiment'].map(sentiment_map)
# split into train, test, val (.7, .15, .15)
train_df, testval_df = train_test_split(final_df, test_size=0.3)
test_df, val_df = train_test_split(testval_df, test_size=0.5)

# convert df back to csv, with column names
train_df.to_csv(data_dir + '/train.csv', index=False)
test_df.to_csv(data_dir + '/test.csv', index=False)
val_df.to_csv(data_dir + '/val.csv', index=False)

# load into torchtext
ID = data.Field()
TEXT = data.Field(tokenize='spacy')
SENTIMENT = data.LabelField(dtype=torch.float)
AIRLINE = data.Field()

# access using batch.id, batch.text etc
fields = [('id', ID), ('text', TEXT), ('airline', AIRLINE),
          ('label', SENTIMENT)]
train_data, valid_data, test_data = data.TabularDataset.splits(
    path=data_dir,
    train='train.csv',
    validation='val.csv',
    test='test.csv',
    format='csv',
    fields=fields,
    skip_header=True)
# build iterators
MAX_VOCAB_SIZE = 10_000
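# The original snippet is cut off after this point. What follows is a minimal sketch
# (an assumption, not the original code) of the usual next steps for these fields:
# build the vocabularies and wrap the three splits in BucketIterators.
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
SENTIMENT.build_vocab(train_data)
AIRLINE.build_vocab(train_data)
ID.build_vocab(train_data)

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.text),
    device=device)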
from torchtext import datasets
import time
import random
import torch.nn as nn
import torch.optim as optim
import spacy
import sys
import pandas as pd

SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
print('begin to load dataset')
TEXT = data.Field(tokenize = 'spacy', include_lengths = True)
LABEL = data.LabelField(dtype = torch.float)

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
train_data, valid_data = train_data.split(random_state = random.getstate())

MAX_VOCAB_SIZE = 25_000
print('building vocab')
TEXT.build_vocab(train_data,
        max_size = MAX_VOCAB_SIZE,
        vectors = "glove.6B.100d",
        unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_data)

BATCH_SIZE = 64
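# The snippet stops before the iterators are created; a minimal sketch (assumption,
# not the original code). sort_within_batch=True is used because TEXT was defined
# with include_lengths=True and packed sequences expect sorted lengths.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    device=device)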
    return train_iter, dev_iter, test_iter






if __name__ == "__main__":

    data_dir = "/home/songyingxin/datasets/SST-2"

    CHAR_NESTING = data.Field(batch_first=True, tokenize=list, lower=True)
    char_field = data.NestedField(CHAR_NESTING, tokenize='spacy')
    word_field = data.Field(tokenize='spacy', lower=True,
                            include_lengths=True, fix_length=100)
    label_field = data.LabelField(dtype=torch.long)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    word_emb_file = "/home/songyingxin/datasets/WordEmbedding/glove/glove.840B.300d.txt"
    char_emb_file = "/home/songyingxin/datasets/WordEmbedding/glove/glove.840B.300d-char.txt"

    train_iter, dev_iter, test_iter = sst_word_char(
        data_dir, word_field, char_field, label_field, 32, device, word_emb_file, char_emb_file)
    
    for batch in train_iter:

        print(batch)


Example #5
def load_dataset(test_sen=None):
    """
    tokenizer : Breaks sentences into a list of words. If sequential=False, no tokenization is applied.
    Field : A class that stores information about how a column should be preprocessed.
    fix_length : An important property of TorchText is that inputs can be variable length; TorchText will
                 dynamically pad each sequence to the longest sequence in that "batch". Here, however, we use
                 fix_length, which pads each sequence to the fixed length given by fix_length.

    build_vocab : First builds a vocabulary (a dictionary mapping every unique word in train_data to an
                  index), and then uses the GloVe word embeddings to map each index to its corresponding embedding vector.

    vocab.vectors : This returns a torch tensor of shape (vocab_size x embedding_dim) containing the pre-trained word embeddings.
    BucketIterator : Defines an iterator that batches examples of similar lengths together to minimize the amount of padding needed.

    """

    #    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=200)
    LABEL = data.LabelField(tensor_type=torch.cuda.FloatTensor)
    INDEX = data.Field(tensor_type=torch.cuda.LongTensor)

    TEXT = data.Field(sequential=True,
                      fix_length=20000,
                      tokenize=tokenizer,
                      pad_first=True,
                      tensor_type=torch.cuda.LongTensor,
                      lower=True,
                      batch_first=True)

    train_data, test_data = data.TabularDataset.splits(
        path='.',
        format='csv',
        skip_header=True,
        train='blogs_training.csv',
        validation='blogs_testing.csv',
        fields=[('index', None), ('text', TEXT), ('fileIndex', None),
                ('label', LABEL), ('age', None), ('industry', None),
                ('hscope', None)])

    #    train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
    TEXT.build_vocab(train_data, vectors=GloVe(name='twitter.27B', dim=100))
    LABEL.build_vocab(train_data)

    pickle.dump(TEXT, open("TEXT.pickle", "wb"))

    word_embeddings = TEXT.vocab.vectors
    print("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print("Label Length: " + str(len(LABEL.vocab)))

    train_data, valid_data = train_data.split(
    )  # Further splitting of training_data to create new training_data & validation_data
    train_iter, valid_iter, test_iter = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=32,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True)
    '''Alternatively we can also use the default configurations'''
    # train_iter, test_iter = datasets.IMDB.iters(batch_size=32)

    vocab_size = len(TEXT.vocab)

    return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter
Example #6
    'ninp': emsize,
    'nhid': nhid,
    'nlayers': nlayers,
    'dropout': dropout,
    'tie_weights': tied,
}

# TODO:
# Rewrite this entire thing: https://github.com/pytorch/text/issues/664
# We have to use the same numericalization as in the example before.
TEXT = data.Field(sequential=True,
                  include_lengths=True,
                  use_vocab=True,
                  tokenize=lambda x: tokenizer.encode(x).tokens)

LABELS = data.LabelField(dtype=torch.float, is_target=True)  # , is_target=True
NAMES = data.RawField(is_target=False)

# Fields are added by column, left to right, in the underlying table
fields = [('name', NAMES), ('label', LABELS), ('text', TEXT)]

train, dev, test = data.TabularDataset.splits(
    path=
    '/Users/phi/Dropbox/projects/picotext/journal/2020-05-23T1315/tmp/processed',
    format='CSV',
    fields=fields,
    train='train.csv',
    validation='dev.csv',
    test='test.csv')

TEXT.build_vocab()  # We'll fill this w/ the tokenizer
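# Not part of the original snippet: one way to "fill" the vocab from the pre-trained
# tokenizer so that torchtext's numericalization matches the tokenizer's own ids.
# Assumes `tokenizer` is a huggingface `tokenizers` Tokenizer whose get_vocab()
# returns a {token: id} dict, and that its unknown token is named '[UNK]'.
from collections import defaultdict

tok_vocab = tokenizer.get_vocab()
unk_id = tok_vocab.get('[UNK]', 0)
TEXT.vocab.itos = [tok for tok, _ in sorted(tok_vocab.items(), key=lambda kv: kv[1])]
TEXT.vocab.stoi = defaultdict(lambda: unk_id, tok_vocab)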
Example #7
def predict(csv1, csv2):

    train = csv1
    test = csv2

    #encoding='gb18030'

    #print(train.shape)

    print('Now loading and predicting........')

    train_df, valid_df = train_test_split(train)

    import spacy
    spacy_en = spacy.load("en_ner_bionlp13cg_md")

    def tokenizer(text):  # create a tokenizer function

        return [tok.text for tok in spacy_en.tokenizer(text)]

    TEXT = data.Field(tokenize=tokenizer, include_lengths=True)
    LABEL = data.LabelField(dtype=torch.float)

    class DataFrameDataset(data.Dataset):
        def __init__(self, df, fields, is_test=False, **kwargs):
            examples = []
            for i, row in df.iterrows():
                label = row.Label if not is_test else None
                text = row.TEXT
                examples.append(data.Example.fromlist([text, label], fields))

            super().__init__(examples, fields, **kwargs)

        @staticmethod
        def sort_key(ex):
            return len(ex.text)

        @classmethod
        def splits(cls, fields, train_df, val_df=None, test_df=None, **kwargs):
            train_data, val_data, test_data = (None, None, None)
            data_field = fields

            if train_df is not None:
                #print('do train')
                train_data = cls(train_df.copy(), data_field, **kwargs)
            if val_df is not None:
                #print('do valid')
                val_data = cls(val_df.copy(), data_field, **kwargs)
            if test_df is not None:
                #print('do test')
                test_data = cls(test_df.copy(), data_field, **kwargs)

            return tuple(d for d in (train_data, val_data, test_data)
                         if d is not None)

    fields = [('text', TEXT), ('label', LABEL)]

    train_ds, val_ds, test_ds = DataFrameDataset.splits(fields,
                                                        train_df=train_df,
                                                        val_df=valid_df,
                                                        test_df=test)

    MAX_VOCAB_SIZE = 10000

    TEXT.build_vocab(train_ds,
                     max_size=MAX_VOCAB_SIZE,
                     vectors='glove.6B.50d',
                     unk_init=torch.Tensor.zero_)

    LABEL.build_vocab(train_ds)

    BATCH_SIZE = 64 * 2

    device = 'cpu'

    train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
        (train_ds, val_ds, test_ds),
        batch_size=BATCH_SIZE,
        sort_within_batch=True,
        device=device)

    INPUT_DIM = len(TEXT.vocab)
    EMBEDDING_DIM = 50
    HIDDEN_DIM = 50
    OUTPUT_DIM = 1
    N_LAYERS = 2
    BIDIRECTIONAL = True

    DROPOUT = 0.1
    PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]  # padding

    class LSTM_net(nn.Module):
        def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                     n_layers, bidirectional, dropout, pad_idx):

            super().__init__()

            self.embedding = nn.Embedding(vocab_size,
                                          embedding_dim,
                                          padding_idx=pad_idx)

            self.rnn = nn.LSTM(embedding_dim,
                               hidden_dim,
                               num_layers=n_layers,
                               bidirectional=bidirectional,
                               dropout=dropout)

            self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)

            self.fc2 = nn.Linear(hidden_dim, 1)

        def forward(self, text, text_lengths):

            embedded = self.embedding(text)

            packed_embedded = nn.utils.rnn.pack_padded_sequence(
                embedded, text_lengths)

            packed_output, (hidden, cell) = self.rnn(packed_embedded)

            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
            #hidden=hidden[-1,:,:]
            output = self.fc1(hidden)
            output = self.fc2(output)

            return output

    from sklearn.metrics import roc_auc_score

    def binary_accuracy(preds, y):
        """
        Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
        """

        rounded_preds = (torch.sigmoid(preds) > 0.41).float()

        correct = (
            rounded_preds == y).float()  #convert into float for division
        acc = correct.sum() / len(correct)

        return acc, rounded_preds, torch.sigmoid(preds)

    def evaluate(model, iterator):

        epoch_acc = 0

        model.eval()

        pred_collect = torch.empty(0)
        y_collect = torch.empty(0)
        y_prob = torch.empty(0)

        with torch.no_grad():
            for batch in iterator:
                text, text_lengths = batch.text
                predictions = model(text, text_lengths).squeeze(1)

                acc, pred_y, prob = binary_accuracy(predictions, batch.label)

                epoch_acc = acc.item() + epoch_acc
                pred_collect = torch.cat([pred_collect, pred_y])
                y_collect = torch.cat([y_collect, batch.label])
                y_prob = torch.cat([y_prob, prob])

        try:
            auc = roc_auc_score(y_collect.cpu().data.numpy(),
                                pred_collect.cpu().data.numpy())
        except ValueError:  # roc_auc_score fails if only one class is present
            auc = 'UNAVAILABLE'
        return epoch_acc / len(iterator), auc, y_collect, y_prob, pred_collect

    model = LSTM_net(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM,
                     N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX)

    model.load_state_dict(torch.load('LSTM_MODEL', map_location='cpu'))

    a, b, my_lab, my_prob, my_pred = evaluate(model, test_iterator)
    return 'Yes, this patient might have readmission' if my_pred.data.numpy(
    ) else 'No, this patient might not have readmission'  #back to label class
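# Hedged usage sketch (not in the original): despite the csv1/csv2 parameter names,
# predict() expects pandas DataFrames with 'TEXT' and 'Label' columns; the file
# names below are placeholders.
import pandas as pd

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
print(predict(train_df, test_df))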
Example #8
def test_model(test_data_dir):
    """ Use trained models to get the final prediction """
    pretrained_models = ['bert-base-uncased', 'xlnet-base-cased', 'roberta-base']
    # load testing data into pandas DataFrame
    with open(test_data_dir) as f:
        test_lines = [line.rstrip('\n')[line.rstrip('\n').find(',') + 1:] for line in f]

    test_df = pd.DataFrame(test_lines, columns=['text'])
    # the model input requires a label column, even though we won't actually use it
    test_df['label'] = 1

    for pretrained_model in pretrained_models:
        # load model
        if pretrained_model == 'bert-base-uncased':
            from transformers import BertForSequenceClassification as SequenceClassificationModel
            selected_epochs = bert_picks
        elif pretrained_model == 'xlnet-base-cased':
            from transformers import XLNetForSequenceClassification as SequenceClassificationModel
            selected_epochs = xlnet_picks
        elif pretrained_model == 'roberta-base':
            from transformers import RobertaForSequenceClassification as SequenceClassificationModel
            selected_epochs = roberta_picks

        config = AutoConfig.from_pretrained(pretrained_model)
        model = SequenceClassificationModel(config)

        # load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
        init_token_idx = tokenizer.cls_token_id
        eos_token_idx = tokenizer.sep_token_id
        pad_token_idx = tokenizer.pad_token_id
        unk_token_idx = tokenizer.unk_token_id

        max_input_length = tokenizer.max_model_input_sizes[pretrained_model]

        def tokenize_and_cut(sentence):
            """ Tokenize the sentence and cut it if it's too long """
            tokens = tokenizer.tokenize(sentence)
            # - 2 is for cls and sep tokens
            tokens = tokens[:max_input_length - 2]
            return tokens

        # the xlnet model does not report a max input size, but it actually has a limit,
        # so we set it manually
        if max_input_length is None:
            max_input_length = 512

        # Field handles the conversion to Tensor (tokenizing)
        TEXT = data.Field(
            batch_first=True,
            use_vocab=False,
            tokenize=tokenize_and_cut,
            preprocessing=tokenizer.convert_tokens_to_ids,
            init_token=init_token_idx,
            eos_token=eos_token_idx,
            pad_token=pad_token_idx,
            unk_token=unk_token_idx
        )

        LABEL = data.LabelField(dtype=torch.long, use_vocab=False)

        # transform DataFrame into torchtext Dataset
        print('Transforming testing data for', pretrained_model, 'model')
        test_data = DataFrameDataset.splits(text_field=TEXT, label_field=LABEL, test_df=test_df)

        BATCH_SIZE = 32
        # get gpu if possible
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        test_iterator = data.Iterator(test_data, batch_size=BATCH_SIZE, device=device, shuffle=False, sort=False, train=False)

        for selected_epoch in selected_epochs:
            # load trained model
            model.load_state_dict(
                torch.load(os.path.join(
                    'models',
                    f'{pretrained_model}-e{selected_epoch:02}-model.pt'
                ), map_location=device)
            )
            model = model.eval()

            # get predictions of test data
            print(f'Testing for {pretrained_model} epoch {selected_epoch}')
            predictions = test(model, test_iterator)

            # map predictions to match the original
            label_map = {0: -1, 1: 1}
            corrected_predictions = list(map(lambda x: label_map[x], predictions))

            # load data into dataframe
            submission = pd.read_csv('predictions_test/sample_submission.csv')
            submission.Prediction = corrected_predictions
            submission.to_csv(os.path.join('predictions_test', f'{pretrained_model}-e{selected_epoch:02}.csv'), index=False)

    test_predictions('predictions_test')
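# The test() helper used above is not shown in this snippet; a hypothetical minimal
# sketch of what it might do (an assumption, not the original implementation).
def test(model, iterator):
    """Return the predicted class index for every example in the iterator."""
    predictions = []
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text)[0]        # transformers models return logits first
            preds = torch.argmax(logits, dim=1)  # class index per example
            predictions.extend(preds.cpu().tolist())
    return predictions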
Example #9
def classify(tokenizerType):
    #Load dataset

    TEXT = data.Field(tokenize=tokenizerOptions[tokenizerType],
                      include_lengths=True,
                      lower=True)
    LABEL = data.LabelField(dtype=torch.float,
                            sequential=False,
                            use_vocab=False)
    fields = [('text', TEXT), ('label', LABEL)]
    train_data = data.TabularDataset(path='amazon_reviews.txt',
                                     format='tsv',
                                     fields=fields)

    #Split dataset into train, validation and test
    train_data, valid_data, test_data = train_data.split(
        split_ratio=[0.64, 0.2, 0.16], random_state=random.seed(SEED))

    #Build vocabulary using predefined vectors
    TEXT.build_vocab(train_data,
                     vectors="glove.6B.100d",
                     unk_init=torch.Tensor.normal_)
    LABEL.build_vocab(train_data)

    #print(TEXT.vocab.itos[:100])
    #Use GPU, if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    #Create iterators to get data in batches
    train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
        datasets=(train_data, valid_data, test_data),
        batch_size=BATCH_SIZE,
        device=device,
        sort_key=lambda x: len(x.text),
        sort=False,
        sort_within_batch=True)

    INPUT_DIM = len(TEXT.vocab)
    EMBEDDING_DIM = 100
    HIDDEN_DIM = 256
    OUTPUT_DIM = 1

    model = LSTM(vocab_size=INPUT_DIM,
                 embedding_dim=EMBEDDING_DIM,
                 hidden_dim=HIDDEN_DIM,
                 output_dim=OUTPUT_DIM,
                 n_layers=3,
                 bidirectional=True,
                 dropout=0.5,
                 pad_idx=TEXT.vocab.stoi[TEXT.pad_token])

    #Replace initial weights of embedding with pre-trained embedding
    pretrained_embeddings = TEXT.vocab.vectors
    model.embedding.weight.data.copy_(pretrained_embeddings)

    #Set UNK and PAD embeddings to zero
    model.embedding.weight.data[TEXT.vocab.stoi[TEXT.unk_token]] = torch.zeros(
        EMBEDDING_DIM)
    model.embedding.weight.data[TEXT.vocab.stoi[TEXT.pad_token]] = torch.zeros(
        EMBEDDING_DIM)

    #SGD optimizer and binary cross entropy loss
    optimizer = optim.Adam(model.parameters())
    criterion = nn.BCEWithLogitsLoss()

    #Transfer model and criterion to GPU
    model = model.to(device)
    criterion = criterion.to(device)

    best_valid_loss = float('inf')
    train_loss_list = []
    valid_loss_list = []

    for epoch in range(N_EPOCHS):

        train_loss, train_acc = train(model, train_iterator, optimizer,
                                      criterion)
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'best-model.pt')

        train_loss_list.append(train_loss)
        valid_loss_list.append(valid_loss)

    print(tokenizerType + ":")
    plotLoss(train_loss_list, valid_loss_list)

    model.load_state_dict(torch.load('best-model.pt'))

    test_loss, test_acc = evaluate(model, test_iterator, criterion)

    print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
    print("\n")
dev.to_csv(os.path.join(TEMP_DIRECTORY, DEV_FILE),
           header=True,
           sep='\t',
           index=False,
           encoding='utf-8')
test.to_csv(os.path.join(TEMP_DIRECTORY, TEST_FILE),
            header=True,
            sep='\t',
            index=False,
            encoding='utf-8')

id_variable = data.Field()
text_variable = data.Field(batch_first=True,
                           tokenize=pipeline,
                           fix_length=FIXED_LENGTH)
target_variable = data.LabelField(dtype=torch.float)

train_fields = [
    ('id', None),  # we don't need this, so no processing
    ('tweet', text_variable),  # process it as text
    ('subtask_a', None),  # raw label string, skipped; we use the encoded label instead
    ('encoded_subtask_a', target_variable)  # process it as the label
]

dev_fields = [
    ('id', id_variable),  # we process this as id field
    ('tweet', text_variable),  # process it as text
    ('subtask_a', None),  # raw label string, skipped
    ('encoded_subtask_a', None)
]
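# Not part of the original snippet: a minimal sketch (assumption) of how these fields
# are typically wired up. TRAIN_FILE and the batch size are placeholders; TEMP_DIRECTORY,
# DEV_FILE and the field objects come from the code above.
train_dataset = data.TabularDataset(path=os.path.join(TEMP_DIRECTORY, TRAIN_FILE),
                                    format='tsv', skip_header=True,
                                    fields=train_fields)
dev_dataset = data.TabularDataset(path=os.path.join(TEMP_DIRECTORY, DEV_FILE),
                                  format='tsv', skip_header=True,
                                  fields=dev_fields)

text_variable.build_vocab(train_dataset, dev_dataset)
target_variable.build_vocab(train_dataset)
id_variable.build_vocab(dev_dataset)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, dev_iterator = data.BucketIterator.splits(
    (train_dataset, dev_dataset),
    batch_size=32,
    sort_key=lambda x: len(x.tweet),
    device=device)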
def train_text_model(userName, projectName, projectType, numEpochs=10):

    S3_BUCKET_OUTPUT = 'gauravp-eva4-capstone-models'
    # Find number of users
    s3 = boto3.client('s3',
                      aws_access_key_id='aws_access_key_id',
                      aws_secret_access_key='aws_secre')

    print('Delete model files corresponding to current session')

    savedTokenizerName = f'{userName}_{projectName}_{projectType}.pkl'
    print(f'Deleting {savedTokenizerName}')
    s3.delete_object(Bucket=S3_BUCKET_OUTPUT, Key=savedTokenizerName)

    savedModelName = f'{userName}_{projectName}_{projectType}.pt'
    print(f'Deleting {savedModelName}')
    s3.delete_object(Bucket=S3_BUCKET_OUTPUT, Key=savedModelName)

    model_info_file_name = f'{userName}_{projectName}_{projectType}.json'
    print(f'Deleting {model_info_file_name}')
    s3.delete_object(Bucket=S3_BUCKET_OUTPUT, Key=model_info_file_name)

    print('Preparing train val splits')
    datasetPath = f'./user_data/{userName}/{projectName}/{projectType}/train_data'

    texts = []
    labels = []
    for dirName in os.listdir(datasetPath):
        dirPath = os.path.join(datasetPath, dirName)
        print(dirPath)
        #print(resizedDirPath)

        count = 0
        for fileName in os.listdir(dirPath):
            filePath = os.path.join(dirPath, fileName)
            #print(filePath)
            labelName = filePath.split('/')[-2]
            print('className: ', labelName, filePath)

            with open(filePath, newline='') as f:
                reader = csv.reader(f)
                row = next(reader)
                print(row)
                texts.append(row[0])
                labels.append(labelName)

    print(len(texts))
    print(len(labels))

    # Defining Fields
    # We are using spacy as the tokenizer
    dataset_text = data.Field(sequential=True,
                              tokenize='spacy',
                              batch_first=True,
                              include_lengths=True)
    dataset_label = data.LabelField(tokenize='spacy',
                                    is_target=True,
                                    batch_first=True,
                                    sequential=False)

    # Define names of dataset and its label
    fields = [('dataset_text', dataset_text), ('dataset_label', dataset_label)]

    # We will gather data into a list
    example = [
        data.Example.fromlist([texts[i], labels[i]], fields)
        for i in range(len(texts))
    ]

    # Define userDataset consisting of data from dataframe and fields defined by us
    userDataset = data.Dataset(example, fields)

    # split dataset into training and validation
    (train, valid) = userDataset.split(split_ratio=[0.70, 0.30])
    print((len(train), len(valid)))

    print(vars(train.examples[10]))

    # Build vocab for the text data as well as the text labels
    dataset_text.build_vocab(train)
    dataset_label.build_vocab(train)

    num_classes = len(dataset_label.vocab)

    print('Size of input vocab : ', len(dataset_text.vocab))
    print('Size of label vocab : ', len(dataset_label.vocab))
    print('Top 10 most frequent words:',
          list(dataset_text.vocab.freqs.most_common(10)))
    print('Labels : ', dataset_label.vocab.stoi)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_iterator, valid_iterator = data.BucketIterator.splits(
        (train, valid),
        batch_size=32,
        sort_key=lambda x: len(x.dataset_text),
        sort_within_batch=True,
        device=device)

    with open('tokenizer.pkl', 'wb') as tokens:
        pickle.dump(dataset_text.vocab.stoi, tokens)

    # Define hyperparameters
    size_of_vocab = len(dataset_text.vocab)
    embedding_dim = 300
    num_hidden_nodes = 100
    num_output_nodes = len(dataset_label.vocab)
    num_layers = 2
    dropout = 0.2

    # Instantiate the model
    model = classifier(size_of_vocab,
                       embedding_dim,
                       num_hidden_nodes,
                       num_output_nodes,
                       num_layers,
                       dropout=dropout)
    print(model)

    # No. of trainable parameters
    def count_parameters(model):
        return sum(p.numel() for p in model.parameters() if p.requires_grad)

    print(f'The model has {count_parameters(model):,} trainable parameters')

    import torch.optim as optim

    # define optimizer and loss
    optimizer = optim.Adam(model.parameters(), lr=2e-4)
    criterion = nn.CrossEntropyLoss()

    # define metric
    def binary_accuracy(preds, y):
        # take the class with the highest score (argmax over the class dimension)
        _, predictions = torch.max(preds, 1)

        correct = (predictions == y).float()
        acc = correct.sum() / len(correct)
        return acc

    # push to cuda if available
    model = model.to(device)
    criterion = criterion.to(device)

    # train loop
    def train(model, iterator, optimizer, criterion):
        # initialize every epoch
        epoch_loss = 0
        epoch_acc = 0

        # set the model in training phase
        model.train()

        for batch in iterator:
            # resets the gradients after every batch
            optimizer.zero_grad()

            # retrieve text and no. of words
            dataset_text, dataset_text_lengths = batch.dataset_text

            # convert to 1D tensor
            predictions = model(dataset_text, dataset_text_lengths).squeeze()

            # compute the loss
            loss = criterion(predictions, batch.dataset_label)

            # compute the binary accuracy
            acc = binary_accuracy(predictions, batch.dataset_label)

            # backpropagate the loss and compute the gradients
            loss.backward()

            # update the weights
            optimizer.step()

            # loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()

        return epoch_loss / len(iterator), epoch_acc / len(iterator)

    #evaluate loop
    def evaluate(model, iterator, criterion):
        # initialize every epoch
        epoch_loss = 0
        epoch_acc = 0

        # deactivating dropout layers
        model.eval()

        # deactivates autograd
        with torch.no_grad():
            for batch in iterator:
                # retrieve text and no. of words
                dataset_text, dataset_text_lengths = batch.dataset_text

                # convert to 1d tensor
                predictions = model(dataset_text,
                                    dataset_text_lengths).squeeze()

                # compute loss and accuracy
                loss = criterion(predictions, batch.dataset_label)
                acc = binary_accuracy(predictions, batch.dataset_label)

                # keep track of loss and accuracy
                epoch_loss += loss.item()
                epoch_acc += acc.item()

        return epoch_loss / len(iterator), epoch_acc / len(iterator)

    N_EPOCHS = numEpochs
    best_valid_loss = float('inf')
    best_valid_acc = 0.0
    best_train_acc = 0.0

    for epoch in range(N_EPOCHS):

        # train the model
        train_loss, train_acc = train(model, train_iterator, optimizer,
                                      criterion)

        # evaluate the model
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

        # save the best model
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), './saved_weights.pt')

        if valid_acc > best_valid_acc:
            best_valid_acc = valid_acc

        if train_acc > best_train_acc:
            best_train_acc = train_acc

        print(
            f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%'
        )
        print(
            f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc * 100:.2f}% \n'
        )

    model.load_state_dict(torch.load('./saved_weights.pt'))

    savedModelName = f'{userName}_{projectName}_{projectType}.pt'
    print("Saved Model Name", savedModelName)
    torch.save(model, savedModelName)

    savedTokenizerName = f'{userName}_{projectName}_{projectType}.pkl'
    os.rename('./tokenizer.pkl', savedTokenizerName)
    # prepare model information file
    model_info = {}
    model_info['numClasses'] = num_classes
    model_info['classNames'] = dataset_label.vocab.itos
    model_info['modelName'] = savedModelName
    model_info['userName'] = userName
    model_info['projectName'] = projectName
    model_info['bestTestAcc'] = best_valid_acc
    model_info['bestTrainAcc'] = best_train_acc
    print(model_info)

    model_info_file_name = f'{userName}_{projectName}_{projectType}.json'
    with open(model_info_file_name, "w") as outfile:
        json.dump(model_info, outfile)

    print('Saving model info and model to s3')
    #    S3_BUCKET_OUTPUT = 'gauravp-eva4-capstone-models'
    # Find number of users
    #    s3 = boto3.client('s3',aws_access_key_id='aws_access_key_id',aws_secret_access_key='aws_secret_access_key')

    s3.upload_file(
        model_info_file_name,
        S3_BUCKET_OUTPUT,
        model_info_file_name,
    )
    s3.upload_file(savedModelName, S3_BUCKET_OUTPUT, savedModelName)
    s3.upload_file(savedTokenizerName, S3_BUCKET_OUTPUT, savedTokenizerName)
    print("Done!!!")
Example #12
def train_section_model(case_folder, params=None):
    """Trains a section formatting model. If no specific parameters are specified, the best identified values are used.
    OUTPUT: Trained model, text vocabulary and label vocabulary"""
    _prepare_data(case_folder)
    if params is None:
        params = {'embedding_dim': 100, 'num_hidden_nodes': 32, 'num_output_nodes': 5, 'bidirection': True,
                  'num_layers': 2, 'dropout': 0.2}
    t.backends.cudnn.deterministic = True
    TEXT = data.Field(tokenize='spacy', batch_first=True, include_lengths=True)
    LABEL = data.LabelField(dtype=t.long, batch_first=True)
    fields = [('text', TEXT), ('label', LABEL)]
    training_data = data.TabularDataset(path='externals/tmp/dataset.csv', format='csv', fields=fields,
                                        skip_header=True)
    train_data, valid_data = training_data.split(split_ratio=0.2, random_state=random.seed(2020))
    TEXT.build_vocab(training_data, min_freq=1, vectors="glove.6B.100d")
    LABEL.build_vocab(training_data)
    # check whether cuda is available
    device = t.device('cuda' if t.cuda.is_available() else 'cpu')
    # set batch size
    BATCH_SIZE = 32

    # Load an iterator
    train_iterator, valid_iterator = data.BucketIterator.splits(
        (train_data, valid_data),
        batch_size=BATCH_SIZE,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True,
        device=device)
    # define hyperparameters
    size_of_vocab = len(TEXT.vocab)
    params['size_of_vocab'] = size_of_vocab
    # instantiate the model
    model = internal_functions.classifier(size_of_vocab, params['embedding_dim'], params['num_hidden_nodes'],
                                          params['num_output_nodes'],
                                          params['num_layers'], bidirectional=params['bidirection'],
                                          dropout=params['dropout'])

    # Initialize the pretrained embedding
    pretrained_embeddings = TEXT.vocab.vectors
    model.embedding.weight.data.copy_(pretrained_embeddings)
    model, optimizer, criterion = internal_functions.optimizer_and_loss(model, device)

    # Now, we train the model
    N_EPOCHS = 10
    best_valid_loss = float('inf')

    for epoch in range(N_EPOCHS):

        # train the model
        model, train_loss, train_acc = internal_functions.train(model, train_iterator, optimizer, criterion)

        # evaluate the model
        valid_loss, valid_acc = internal_functions.evaluate(model, valid_iterator, criterion)

        # save the best model
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            _save_obj({'params': params, 'model': model.state_dict(), 'vocab_dict': TEXT.vocab.stoi,
                       'label_dict': LABEL.vocab.stoi, 'acc': valid_acc, 'timestamp': datetime.datetime.utcnow()},
                      'externals/tmp/section_model')

        # print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
        # print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc * 100:.2f}%')
    os.remove('externals/tmp/dataset.csv')
    if not os.path.exists('externals/'+device.type+'_section_model.pkl'):
        shutil.move('externals/tmp/section_model.pkl', 'externals/'+device.type+'_section_model.pkl')
    return model, TEXT.vocab.stoi, LABEL.vocab.stoi
Example #13
    def test_stratified_dataset_split(self):
        num_examples, num_labels = 30, 3
        self.write_test_splitting_dataset(num_examples=num_examples,
                                          num_labels=num_labels)
        text_field = data.Field()
        label_field = data.LabelField()
        fields = [('text', text_field), ('label', label_field)]

        dataset = data.TabularDataset(path=self.test_dataset_splitting_path,
                                      format="csv",
                                      fields=fields)

        # Default split ratio
        expected_train_size = 21
        expected_test_size = 9

        train, test = dataset.split(stratified=True)
        assert len(train) == expected_train_size
        assert len(test) == expected_test_size

        # Test array arguments with same ratio
        split_ratio = [0.7, 0.3]
        train, test = dataset.split(split_ratio=split_ratio, stratified=True)
        assert len(train) == expected_train_size
        assert len(test) == expected_test_size

        # Test strata_field argument
        train, test = dataset.split(split_ratio=split_ratio,
                                    stratified=True,
                                    strata_field='label')
        assert len(train) == expected_train_size
        assert len(test) == expected_test_size

        # Test invalid field name
        strata_field = 'dummy'
        with pytest.raises(ValueError):
            dataset.split(split_ratio=split_ratio,
                          stratified=True,
                          strata_field=strata_field)

        # Test uneven stratify sizes
        num_examples, num_labels = 28, 3
        self.write_test_splitting_dataset(num_examples=num_examples,
                                          num_labels=num_labels)
        # 10 examples for class 1 and 9 examples for classes 2,3
        dataset = data.TabularDataset(path=self.test_dataset_splitting_path,
                                      format="csv",
                                      fields=fields)

        expected_train_size = 7 + 6 + 6
        expected_test_size = 3 + 3 + 3
        train, test = dataset.split(split_ratio=split_ratio, stratified=True)
        assert len(train) == expected_train_size
        assert len(test) == expected_test_size

        split_ratio = [0.7, 0.3]
        train, test = dataset.split(split_ratio=split_ratio, stratified=True)
        assert len(train) == expected_train_size
        assert len(test) == expected_test_size

        # Add validation set
        split_ratio = [0.6, 0.3, 0.1]
        expected_train_size = 6 + 5 + 5
        expected_valid_size = 1 + 1 + 1
        expected_test_size = 3 + 3 + 3
        train, valid, test = dataset.split(split_ratio=split_ratio,
                                           stratified=True)
        assert len(train) == expected_train_size
        assert len(valid) == expected_valid_size
        assert len(test) == expected_test_size
Example #14
    def test_dataset_split_arguments(self):
        num_examples, num_labels = 30, 3
        self.write_test_splitting_dataset(num_examples=num_examples,
                                          num_labels=num_labels)
        text_field = data.Field()
        label_field = data.LabelField()
        fields = [('text', text_field), ('label', label_field)]

        dataset = data.TabularDataset(path=self.test_dataset_splitting_path,
                                      format="csv",
                                      fields=fields)

        # Test default split ratio (0.7)
        expected_train_size = 21
        expected_test_size = 9

        train, test = dataset.split()
        assert len(train) == expected_train_size
        assert len(test) == expected_test_size

        # Test array arguments with same ratio
        split_ratio = [0.7, 0.3]
        train, test = dataset.split(split_ratio=split_ratio)
        assert len(train) == expected_train_size
        assert len(test) == expected_test_size

        # Add validation set
        split_ratio = [0.6, 0.3, 0.1]
        expected_train_size = 18
        expected_valid_size = 3
        expected_test_size = 9

        train, valid, test = dataset.split(split_ratio=split_ratio)
        assert len(train) == expected_train_size
        assert len(valid) == expected_valid_size
        assert len(test) == expected_test_size

        # Test ratio normalization
        split_ratio = [6, 3, 1]
        train, valid, test = dataset.split(split_ratio=split_ratio)
        assert len(train) == expected_train_size
        assert len(valid) == expected_valid_size
        assert len(test) == expected_test_size

        # Test only two splits returned for too small valid split size
        split_ratio = [0.66, 0.33, 0.01]
        expected_length = 2
        splits = dataset.split(split_ratio=split_ratio)
        assert len(splits) == expected_length

        # Test invalid arguments
        split_ratio = 1.1
        with pytest.raises(AssertionError):
            dataset.split(split_ratio=split_ratio)

        split_ratio = -1.
        with pytest.raises(AssertionError):
            dataset.split(split_ratio=split_ratio)

        split_ratio = [0.7]
        with pytest.raises(AssertionError):
            dataset.split(split_ratio=split_ratio)

        split_ratio = [1, 2, 3, 4]
        with pytest.raises(AssertionError):
            dataset.split(split_ratio=split_ratio)

        split_ratio = "string"
        with pytest.raises(ValueError):
            dataset.split(split_ratio=split_ratio)
Example #15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train', type=Path, required=True)
    parser.add_argument('--dev', type=Path, required=True)
    parser.add_argument('--output-dir', type=Path, required=True)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--batch_size', type=int, default=512)

    args = parser.parse_args()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    TEXT = data.Field(tokenize='spacy', lower=True)
    LABEL = data.LabelField()
    train_csv = os.path.join('/', args.output_dir, 'train.csv')
    json_to_csv(args.train, train_csv)
    dev_csv = os.path.join('/', args.output_dir, 'dev.csv')
    json_to_csv(args.dev, dev_csv)

    train_data, val_data = data.TabularDataset.splits(path=args.output_dir,
                                                      train='train.csv',
                                                      validation='dev.csv',
                                                      format='csv',
                                                      skip_header=True,
                                                      fields=[
                                                          ('sentence1', TEXT),
                                                          ('sentence2', TEXT),
                                                          ('gold_label', LABEL)
                                                      ])

    TEXT.build_vocab(train_data,
                     min_freq=2,
                     vectors="glove.6B.300d",
                     unk_init=torch.Tensor.normal_)
    field_path = os.path.join('/', args.output_dir, 'bilstm-field.pt')
    torch.save(TEXT, field_path, pickle_module=dill)
    LABEL.build_vocab(train_data)

    train_iterator, valid_iterator = data.BucketIterator.splits(
        (train_data, val_data),
        batch_size=args.batch_size,
        device=device,
        sort_key=lambda x: len(x.sentence1),
        sort_within_batch=False)

    pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
    model = BiLSTM(input_dim=len(TEXT.vocab),
                   embedding_dim=300,
                   hidden_dim=300,
                   lstm_layers=2,
                   fc_layers=3,
                   output_dim=len(LABEL.vocab),
                   dropout=0.25,
                   pad_idx=pad_idx).to(device)

    model.embedding.weight.data[pad_idx] = torch.zeros(300)
    model.embedding.weight.requires_grad = True
    optimizer = optim.Adam(model.parameters())
    ce_loss = nn.CrossEntropyLoss().to(device)
    #torch.set_default_tensor_type('torch.cuda.FloatTensor')

    best_valid_loss = float('inf')
    model_path = os.path.join('/', args.output_dir, 'bilstm.pt')

    for epoch in range(args.epochs):

        train_loss = 0
        train_acc = 0
        model.train()

        for batch in train_iterator:
            prem = batch.sentence1
            hypo = batch.sentence2
            labels = batch.gold_label

            optimizer.zero_grad()
            predictions = model(prem, hypo)
            loss = ce_loss(predictions, labels)
            acc = accuracy(predictions, labels)

            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_acc += acc.item()

        train_loss = train_loss / len(train_iterator)
        train_acc = train_acc / len(train_iterator)

        valid_loss = 0
        valid_acc = 0

        model.eval()

        with torch.no_grad():

            for batch in valid_iterator:

                prem = batch.sentence1
                hypo = batch.sentence2
                labels = batch.gold_label

                predictions = model(prem, hypo)
                loss = ce_loss(predictions, labels)
                acc = accuracy(predictions, labels)

                valid_loss += loss.item()
                valid_acc += acc.item()

        valid_loss = valid_loss / len(valid_iterator)
        valid_acc = valid_acc / len(valid_iterator)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), model_path)

        print(f'Epoch: {epoch+1:02}')
        print(
            f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%'
        )
        print(
            f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%'
        )
Example #16
def main(config):

    if not os.path.exists(config.model_dir):
        os.makedirs(config.model_dir)

    if not os.path.exists(config.log_dir):
        os.makedirs(config.log_dir)

    print("\t \t \t the model name is {}".format(config.model_name))
    device, n_gpu = get_device()

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(config.seed)
        torch.backends.cudnn.deterministic = True  # use deterministic cudnn algorithms so results are reproducible
    """ Prepare the SST-2 data """
    text_field = data.Field(tokenize='spacy',
                            lower=True,
                            include_lengths=True,
                            fix_length=config.sequence_length)
    label_field = data.LabelField(dtype=torch.long)

    train_iterator, dev_iterator, test_iterator = load_sst2(
        config.data_path, text_field, label_field, config.batch_size, device,
        config.glove_word_file, config.cache_path)
    """ Prepare the word vectors """
    pretrained_embeddings = text_field.vocab.vectors

    model_file = config.model_dir + 'model1.pt'
    """ Prepare the model """
    if config.model_name == "TextCNN":
        from TextCNN import TextCNN
        filter_sizes = [int(val) for val in config.filter_sizes.split()]
        model = TextCNN.TextCNN(config.glove_word_dim, config.filter_num,
                                filter_sizes, config.output_dim,
                                config.dropout, pretrained_embeddings)
    elif config.model_name == "TextRNN":
        from TextRNN import TextRNN
        model = TextRNN.TextRNN(config.glove_word_dim, config.output_dim,
                                config.hidden_size, config.num_layers,
                                config.bidirectional, config.dropout,
                                pretrained_embeddings)

    elif config.model_name == "LSTMATT":
        from LSTM_ATT import LSTMATT
        model = LSTMATT.LSTMATT(config.glove_word_dim, config.output_dim,
                                config.hidden_size, config.num_layers,
                                config.bidirectional, config.dropout,
                                pretrained_embeddings)
    elif config.model_name == 'TextRCNN':
        from TextRCNN import TextRCNN
        model = TextRCNN.TextRCNN(config.glove_word_dim, config.output_dim,
                                  config.hidden_size, config.num_layers,
                                  config.bidirectional, config.dropout,
                                  pretrained_embeddings)

    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss()

    model = model.to(device)
    criterion = criterion.to(device)

    if config.do_train:
        train(config.epoch_num, model, train_iterator, dev_iterator, optimizer,
              criterion, ['0', '1'], model_file, config.log_dir,
              config.print_step, 'word')

    model.load_state_dict(torch.load(model_file))

    test_loss, test_acc, test_report = evaluate(model, test_iterator,
                                                criterion, ['0', '1'], 'word')
    print("-------------- Test -------------")
    print(
        "\t Loss: {} | Acc: {} | Micro avg F1: {} | Macro avg F1: {} | Weighted avg F1: {}"
        .format(test_loss, test_acc, test_report['micro avg']['f1-score'],
                test_report['macro avg']['f1-score'],
                test_report['weighted avg']['f1-score']))
def main(file_path, batch_size, base_model, num_epochs):
    """Train movie sentiment model"""

    # %%
    # base_model  = "roberta-base"
    # batch_size=8
    # num_epochs=5
    print("Initializing models")

    tokenizer = RobertaTokenizerFast.from_pretrained(base_model)
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    model = RoBERTaSentimentClassifier(device=device, base_model=base_model)

    print(f"Using device {model.device}")

    #%%
    train_cache = Path(".data/cache/train_data")
    val_cache = Path(".data/cache/validate_data")

    if train_cache.exists() and val_cache.exists():
        print("Load cached datasets")
        train = load_cached_dataset(train_cache)
        val = load_cached_dataset(val_cache)
    else:
        print("Generating datasets")
        PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
        UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

        # set up fields
        TEXT = data.Field(use_vocab=False,
                          include_lengths=False,
                          batch_first=True,
                          lower=False,
                          fix_length=512,
                          tokenize=tokenizer.encode,
                          pad_token=PAD_INDEX,
                          unk_token=UNK_INDEX)

        LABEL = data.LabelField()

        # make splits for data
        train, test = datasets.IMDB.splits(TEXT, LABEL)

        LABEL.build_vocab(train)

        test, val = test.split(split_ratio=0.9)

        print("Cache train and validate sets")

        save_cached_dataset(train, train_cache)
        save_cached_dataset(val, val_cache)

    print("Prepare dataset iterators")
    # make iterator for splits
    train_iter, val_iter = data.BucketIterator.splits((train, val),
                                                      batch_size=batch_size,
                                                      device=device)

    #%%
    for batch in val_iter:
        if batch.text.shape[0] != batch.label.shape[0]:
            print(batch)
        # print(batch.text.shape, batch.label.shape)
        # break
    #%%
    #dir(val_iter)
    #%%
    # initialize running values
    running_loss = 0.0
    valid_running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []
    best_valid_loss = float("Inf")

    model.train()

    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

    for item in train_iter:
        print(item)
        break

    print("Start training")
    for epoch in range(1, num_epochs + 1):

        print(f"Epoch {epoch}")

        train_iter.init_epoch()
        val_iter.init_epoch()

        for i, (text, labels) in enumerate(tqdm(train_iter, desc="train")):
            labels = labels.type(torch.LongTensor)
            labels = labels.to(device)
            output = model(text, labels)

            loss, _ = output

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update running values
            running_loss += loss.item()
            global_step += 1

        model.eval()
        with torch.no_grad():

            answers = []

            # validation loop
            for i, (text, labels) in enumerate(tqdm(val_iter,
                                                    desc="validate")):
                labels = labels.type(torch.LongTensor)
                labels = labels.to(device)
                output = model(text, labels)
                loss, preds = output

                correct = torch.argmax(preds, dim=1) == labels

                answers.extend(correct.cpu().tolist())

                valid_running_loss += loss.item()

            average_accuracy = sum([1 for a in answers if a]) / len(answers)

            # evaluation: average the accumulated losses over the number of batches
            average_train_loss = running_loss / len(train_iter)
            average_valid_loss = valid_running_loss / len(val_iter)
            train_loss_list.append(average_train_loss)
            valid_loss_list.append(average_valid_loss)
            global_steps_list.append(global_step)

            # resetting running values
            running_loss = 0.0
            valid_running_loss = 0.0
            model.train()

            # print progress
            print(
                'Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}, Valid Acc: {:.4f}'
                .format(epoch, num_epochs, global_step,
                        num_epochs * len(train_iter), average_train_loss,
                        average_valid_loss, average_accuracy))

            # checkpoint
            if best_valid_loss > average_valid_loss:
                best_valid_loss = average_valid_loss
                save_checkpoint(file_path + '/' + 'model.pt', model,
                                best_valid_loss)
                save_metrics(file_path + '/' + 'metrics.pt', train_loss_list,
                             valid_loss_list, global_steps_list)

    save_metrics(file_path + '/' + 'metrics.pt', train_loss_list,
                 valid_loss_list, global_steps_list)
    print('Finished Training!')
def create_fields():
    TEXT = data.Field(sequential=True, tokenize="basic_english")

    LABEL = data.LabelField(dtype=torch.float)

    return TEXT, LABEL
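# Hedged usage sketch (not part of the original): wiring the fields returned by
# create_fields() into a TabularDataset; the file name is a placeholder.
TEXT, LABEL = create_fields()
dataset = data.TabularDataset(path='reviews.tsv', format='tsv',
                              fields=[('text', TEXT), ('label', LABEL)])
TEXT.build_vocab(dataset)
LABEL.build_vocab(dataset)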
    prediction = model(tensor, length_tensor)  #prediction
    return prediction.item()


SEED = 42
BATCH_SIZE = 64
torch.manual_seed(SEED)
embedding_dim = 100
num_hidden_nodes = 32
num_output_nodes = 1
num_layers = 2
bidirection = True
dropout = 0.2

TEXT = data.Field(tokenize='spacy', batch_first=True, include_lengths=True)
LABEL = data.LabelField(dtype=torch.float, batch_first=True)
fields = [(None, None), ('text', TEXT), ('label', LABEL)]

training_data = data.TabularDataset(path='quora.csv',
                                    format='csv',
                                    fields=fields,
                                    skip_header=True)

train_data, valid_data = training_data.split(split_ratio=0.8,
                                             random_state=random.seed(SEED))

TEXT.build_vocab(train_data, min_freq=3, vectors="glove.6B.100d")
size_of_vocab = len(TEXT.vocab)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = classifier(size_of_vocab,
Example #20
            np.random.binomial(1,
                               p=self.p_word_dropout,
                               size=tuple(data.size())).astype('uint8'))

        if self.gpu:
            mask = mask.cuda()

        # Set to <unk>
        data[mask] = self.UNK_IDX

        return Variable(data)


###########################################################################
mTEXT = data.Field(tokenize='spacy')
mLABEL = data.LabelField(tensor_type=torch.FloatTensor)

print("loading dataset male_sent_obftrain_less700.tsv...")
mtrain = data.TabularDataset.splits(path='../sent/ori_gender_data/',
                                    train='male_sent_obftrain_less700.tsv',
                                    format='tsv',
                                    fields=[('Text', mTEXT),
                                            ('Label', mLABEL)])[0]

print("creating vocab for mTEXT")
mTEXT.build_vocab(mtrain, max_size=60000, vectors="glove.6B.100d")
mLABEL.build_vocab(mtrain)

mLABEL.vocab.stoi['1'] = 1
mLABEL.vocab.stoi['2'] = 2
mLABEL.vocab.stoi['3'] = 3
Example #21
def main(config, model_filename):
    if not os.path.exists(config.output_dir):
        os.makedirs(config.output_dir)

    if not os.path.exists(config.cache_dir):
        os.makedirs(config.cache_dir)

    model_file = os.path.join(config.output_dir, model_filename)

    # Prepare the device
    gpu_ids = [int(device_id) for device_id in config.gpu_ids.split()]
    device, n_gpu = get_device(gpu_ids[0])
    if n_gpu > 1:
        n_gpu = len(gpu_ids)

    # Set Random Seeds
    random.seed(config.seed)
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(config.seed)
        torch.backends.cudnn.deterministic = True

    # Prepare the data
    id_field = data.RawField()
    id_field.is_target = False
    text_field = data.Field(tokenize='spacy', lower=True, include_lengths=True)
    label_field = data.LabelField(dtype=torch.long)

    train_iterator, dev_iterator, test_iterator = load_data(
        config.data_path, id_field, text_field, label_field,
        config.train_batch_size, config.dev_batch_size, config.test_batch_size,
        device, config.glove_word_file, config.cache_dir)

    # Word Vector
    word_emb = text_field.vocab.vectors

    if config.model_name == "GAReader":
        from Baselines.GAReader.GAReader import GAReader
        model = GAReader(config.glove_word_dim, config.output_dim,
                         config.hidden_size, config.rnn_num_layers,
                         config.ga_layers, config.bidirectional,
                         config.dropout, word_emb)
        print(model)

    # optimizer = optim.Adam(model.parameters(), lr=config.lr)
    optimizer = optim.SGD(model.parameters(), lr=config.lr)
    criterion = nn.CrossEntropyLoss()

    model = model.to(device)
    criterion = criterion.to(device)

    if config.do_train:
        train(config.epoch_num, model, train_iterator, dev_iterator, optimizer,
              criterion, ['0', '1', '2', '3', '4'], model_file, config.log_dir,
              config.print_step, config.clip)

    model.load_state_dict(torch.load(model_file))

    test_loss, test_acc, test_report = evaluate(model, test_iterator,
                                                criterion,
                                                ['0', '1', '2', '3', '4'])
    print("-------------- Test -------------")
    print("\t Loss: {} | Acc: {} | Macro avg F1: {} | Weighted avg F1: {}".
          format(test_loss, test_acc, test_report['macro avg']['f1-score'],
                 test_report['weighted avg']['f1-score']))
Example #22
def main(args):

    aspects = []
    with open(args.dataset + "/" + args.dataset + "_aspects.txt") as f:
        for line in f:
            lst = line.split()
            aspect = lst[0].lower()
            aspects.append(aspect)
    print(aspects)

    TEXT = data.Field(tokenize=tokenizer)
    train_data = data.TabularDataset(path=args.dataset + "/" + args.dataset +
                                     "_train.csv",
                                     format='csv',
                                     fields=[('text', TEXT)])
    LABEL = data.LabelField()
    test_data = data.TabularDataset(path=args.dataset + "/" + args.dataset +
                                    "_test.csv",
                                    format='csv',
                                    fields=[('text', TEXT), ('label', LABEL)])
    embedding = torchtext.vocab.Vectors(args.dataset + "/" + args.dataset +
                                        ".200d.txt")

    MAX_VOCAB_SIZE = 40000
    TEXT.build_vocab(train_data,
                     max_size=MAX_VOCAB_SIZE,
                     vectors=embedding,
                     unk_init=torch.Tensor.normal_)
    LABEL.build_vocab(test_data)
    print(LABEL.vocab.stoi)
    print(LABEL.vocab.itos)
    BATCH_SIZE = int(len(train_data) / 500)

    if torch.cuda.is_available():
        torch.cuda.set_device(6)
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    #device = torch.device('cpu')

    train_iterator = data.BucketIterator(train_data,
                                         batch_size=BATCH_SIZE,
                                         device=device,
                                         sort=False)
    test_iterator = data.BucketIterator(test_data,
                                        batch_size=len(test_data),
                                        device=device,
                                        sort=False)

    LABEL_KPLUS = data.LabelField()
    test_kplus_data = data.TabularDataset(path=args.dataset + "/" +
                                          args.dataset + "_test_kplus.csv",
                                          format='csv',
                                          fields=[('text', TEXT),
                                                  ('label', LABEL_KPLUS)])
    test_kplus_iterator = data.BucketIterator(test_kplus_data,
                                              batch_size=len(test_kplus_data),
                                              device=device,
                                              sort=False)
    LABEL_KPLUS.build_vocab(test_kplus_data)
    print(LABEL_KPLUS.vocab.stoi)

    from sklearn import metrics

    def train_metric(preds, label):
        max_preds = preds.argmax(dim=1)
        max_label = label.argmax(dim=1)
        acc = metrics.accuracy_score(max_label.cpu().numpy(),
                                     max_preds.cpu().numpy())
        return acc

    def train(model, pseudolabel, iterator, optimizer):
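        """Train `model` to match the soft aspect distribution produced by the
        frozen `pseudolabel` network, using KL divergence as the loss."""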

        criterion = nn.KLDivLoss()

        epoch_loss = 0
        epoch_acc = 0

        model.train()
        pseudolabel.eval()

        for batch in iterator:

            optimizer.zero_grad()

            probs, _ = model(batch.text)  #[batch size, output dim]

            p, q = pseudolabel(batch.text)

            loss = criterion(torch.log(probs), p.detach())

            acc = train_metric(probs, p.detach())

            loss.backward()

            optimizer.step()

            epoch_loss += loss.item()
            epoch_acc += acc

        return epoch_loss / len(iterator), epoch_acc / len(iterator)

    def evaluate(model, eval_data, LABEL):
        preds = []
        labels = []
        for e in eval_data.examples:
            pred = predict(model, e.text)
            preds.append(pred)
            labels.append(LABEL.vocab.stoi[e.label])

        f1 = metrics.f1_score(labels, preds, average='weighted')
        acc = metrics.accuracy_score(labels, preds)
        return acc, f1

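    # Inference helpers: predict/predict_class take the argmax of the model's first
    # output, predict_pseudolabel uses its second output, and get_qs/get_p expose
    # the raw distributions used for the threshold and seed-word updates below.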
    def predict_class(model, sentence, min_len=5):
        model.eval()
        tokenized = [tok for tok in tokenizer(sentence)]
        if len(tokenized) < min_len:
            tokenized += ['<pad>'] * (min_len - len(tokenized))
        indexed = [TEXT.vocab.stoi[t] for t in tokenized]
        tensor = torch.LongTensor(indexed).to(device)
        tensor = tensor.unsqueeze(1)
        preds, _ = model(tensor)
        max_preds = preds.argmax(dim=1)
        return max_preds.item()

    def predict(model, sentence, min_len=5):
        model.eval()
        if len(sentence) < min_len:
            sentence += ['<pad>'] * (min_len - len(sentence))
        indexed = [TEXT.vocab.stoi[t] for t in sentence]
        tensor = torch.LongTensor(indexed).to(device)
        tensor = tensor.unsqueeze(1)
        preds, _ = model(tensor)
        max_preds = preds.argmax(dim=1)
        return max_preds.item()

    def predict_pseudolabel(model, sentence, min_len=5):
        model.eval()
        if len(sentence) < min_len:
            sentence += ['<pad>'] * (min_len - len(sentence))
        indexed = [TEXT.vocab.stoi[t] for t in sentence]
        tensor = torch.LongTensor(indexed).to(device)
        tensor = tensor.unsqueeze(1)
        _, preds = model(tensor)
        max_preds = preds.argmax(dim=1)
        return max_preds.item()

    def get_qs(model, sentence, min_len=5):
        model.eval()
        if len(sentence) < min_len:
            sentence += ['<pad>'] * (min_len - len(sentence))
        indexed = [TEXT.vocab.stoi[t] for t in sentence]
        tensor = torch.LongTensor(indexed).to(device)
        tensor = tensor.unsqueeze(1)
        p, q = model(tensor)
        max_q = torch.max(q, 1)[1]
        return q, max_q

    def get_p(model, sentence, min_len=5):
        model.eval()
        if len(sentence) < min_len:
            sentence += ['<pad>'] * (min_len - len(sentence))
        indexed = [TEXT.vocab.stoi[t] for t in sentence]
        tensor = torch.LongTensor(indexed).to(device)
        tensor = tensor.unsqueeze(1)
        preds, classes = model(tensor)
        return preds, classes

    import datetime
    time = int(datetime.datetime.now().timestamp())

    if not os.path.exists('outputs'):
        os.makedirs('outputs')

    import logging
    logging.basicConfig(filename='outputs/' + str(time) + 'train-' +
                        args.dataset + '.log',
                        level=logging.DEBUG)
    logging.debug("no filtering: " + str(args.no_filtering))
    logging.debug("no tuning: " + str(args.no_tuning))

    import collections
    seed_words_d = collections.defaultdict(set)
    with open(args.dataset + "/" + args.dataset + "_seeds.txt") as f:
        for line in f:
            lst = line.split()
            w1 = lst[0].lower()
            w2 = lst[1].lower()
            seed_words_d[w2].add(w1)

    seed_words = sorted(seed_words_d.items(),
                        key=lambda x: LABEL.vocab.stoi[x[0]])
    print(seed_words)

    def get_seed_embedding(seed_words):
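        """Average the GloVe vectors of each aspect's seed words and stack the
        result into a (num_aspects, 1, 1, embedding_dim) tensor."""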
        SEED_WORDS = []
        for w, lst in seed_words:
            temp = []
            for e in lst:
                temp.append(
                    TEXT.vocab.vectors[TEXT.vocab.stoi[e]].unsqueeze(0))
            embeds = torch.cat(temp)
            embed = torch.mean(embeds, dim=0)
            SEED_WORDS.append(embed.unsqueeze(0))
        SEED_WORDS = torch.cat(SEED_WORDS)
        SEED_WORDS = SEED_WORDS.unsqueeze(1)
        SEED_WORDS = SEED_WORDS.unsqueeze(1)
        return SEED_WORDS

    SEED_WORDS = get_seed_embedding(seed_words)
    print(SEED_WORDS.shape)

    def init_kmodel(SEED_WORDS):
        INPUT_DIM = len(TEXT.vocab)
        EMBEDDING_DIM = 200
        N_FILTERS = 100
        FILTER_SIZES = [2, 3, 4]
        KOUTPUT_DIM = len(LABEL.vocab)
        DROPOUT = 0.5
        PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

        k_model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES,
                      KOUTPUT_DIM, DROPOUT, PAD_IDX)
        k_model = k_model.to(device)
        #k_model.load_state_dict(torch.load('k-model.pt'))

        k_pseudolabel = PseudoLabel(INPUT_DIM, EMBEDDING_DIM, KOUTPUT_DIM,
                                    KOUTPUT_DIM, PAD_IDX, SEED_WORDS)
        k_pseudolabel.eval()
        k_pseudolabel = k_pseudolabel.to(device)

        pretrained_embeddings = TEXT.vocab.vectors

        k_model.embedding.weight.data.copy_(pretrained_embeddings)
        k_pseudolabel.embedding.weight.data.copy_(pretrained_embeddings)

        UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

        k_model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
        k_model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

        k_pseudolabel.embedding.weight.data[UNK_IDX] = torch.zeros(
            EMBEDDING_DIM)
        k_pseudolabel.embedding.weight.data[PAD_IDX] = torch.zeros(
            EMBEDDING_DIM)
        return k_model, k_pseudolabel

    k_model, k_pseudolabel = init_kmodel(SEED_WORDS)
    k_model_optimizer = optim.Adam(
        filter(lambda p: p.requires_grad, k_model.parameters()))

    N_EPOCHS = 5
    for epoch in range(N_EPOCHS):

        print("epoch: ", epoch + 1)

        train_loss, train_acc = train(k_model, k_pseudolabel, train_iterator,
                                      k_model_optimizer)

        print("training loss: ", train_loss)
        print("training accuracy: ", train_acc)

        valid_acc, valid_f1 = evaluate(k_model, test_data, LABEL)

        print("validation accuracy: ", valid_acc)
        print('validation F1:', valid_f1)
    torch.cuda.empty_cache()

    preds = []
    labels = []
    for e in test_data.examples:
        pred = predict(k_model, e.text)
        preds.append(pred)
        labels.append(LABEL.vocab.stoi[e.label])

    def log_info(labels, preds):
        print(metrics.accuracy_score(labels, preds))
        logging.debug(metrics.accuracy_score(labels, preds))
        print(metrics.precision_score(labels, preds, average='weighted'))
        logging.debug(
            metrics.precision_score(labels, preds, average='weighted'))
        print(metrics.recall_score(labels, preds, average='weighted'))
        logging.debug(metrics.recall_score(labels, preds, average='weighted'))
        print(metrics.f1_score(labels, preds, average='weighted'))
        logging.debug(metrics.f1_score(labels, preds, average='weighted'))
        m = confusion_matrix(labels, preds)
        print(m)
        logging.debug(m)

    log_info(labels, preds)

    logging.debug("k pseudolabel")
    preds = []
    labels = []
    for e in test_data.examples:
        pred = predict_pseudolabel(k_pseudolabel, e.text)
        preds.append(pred)
        labels.append(LABEL.vocab.stoi[e.label])
    log_info(labels, preds)

    def compute_threshold():
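        """Return the args.quantile quantile of the normalized prediction entropy
        h_norm = -(1 / log K) * sum_v p_v * log(p_v) over the training examples."""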
        lst1 = []
        lst2 = []
        for e in train_data.examples:
            qs, _ = get_qs(k_pseudolabel, e.text)
            preds, _ = get_p(k_model, e.text)
            vs = [v.item() for v in preds.squeeze(0) if v.item() != 0]
            h_norm = (-1 / math.log(preds.shape[1])) * sum(
                [v * math.log(v) for v in vs])
            #if e.label == 'miscellaneous':
            #lst1.append(int(h_norm*100))
            #else:
            lst2.append(int(h_norm * 100))

        a = np.array(lst2)
        threshold = np.quantile(a, args.quantile) / 100
        return threshold

    threshold = compute_threshold()
    #threshold = 0.2
    print("threshold:", threshold)
    logging.debug("threshold:" + str(threshold))

    def init_kplusmodel(SEED_WORDS):
        INPUT_DIM = len(TEXT.vocab)
        EMBEDDING_DIM = 200
        N_FILTERS = 100
        FILTER_SIZES = [2, 3, 4]
        OUTPUT_DIM = len(LABEL_KPLUS.vocab)
        DROPOUT = 0.5
        PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

        kplus_model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES,
                          OUTPUT_DIM, DROPOUT, PAD_IDX)
        kplus_model = kplus_model.to(device)

        kplus_pseudolabel = PseudoLabelPlus(INPUT_DIM, EMBEDDING_DIM,
                                            OUTPUT_DIM - 1, OUTPUT_DIM - 1,
                                            PAD_IDX, SEED_WORDS, k_model,
                                            threshold, args.upperbound, device,
                                            LABEL_KPLUS.vocab.stoi)
        kplus_pseudolabel = kplus_pseudolabel.to(device)
        kplus_pseudolabel.eval()

        pretrained_embeddings = TEXT.vocab.vectors

        kplus_model.embedding.weight.data.copy_(pretrained_embeddings)
        kplus_pseudolabel.embedding.weight.data.copy_(pretrained_embeddings)

        UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

        kplus_model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
        kplus_model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

        kplus_pseudolabel.embedding.weight.data[UNK_IDX] = torch.zeros(
            EMBEDDING_DIM)
        kplus_pseudolabel.embedding.weight.data[PAD_IDX] = torch.zeros(
            EMBEDDING_DIM)
        return kplus_model, kplus_pseudolabel

    kplus_model, kplus_pseudolabel = init_kplusmodel(SEED_WORDS)
    kplus_model_optimizer = optim.Adam(
        filter(lambda p: p.requires_grad, kplus_model.parameters()))

    N_EPOCHS = 5

    for epoch in range(N_EPOCHS):

        print("epoch: ", epoch + 1)

        train_loss, train_acc = train(kplus_model, kplus_pseudolabel,
                                      train_iterator, kplus_model_optimizer)

        print("training loss: ", train_loss)
        print("training accuracy: ", train_acc)

        valid_acc, valid_f1 = evaluate(kplus_model, test_kplus_data,
                                       LABEL_KPLUS)

        print("validation accuracy: ", valid_acc)
        print('validation F1:', valid_f1)

    torch.cuda.empty_cache()

    preds = []
    labels = []
    for e in test_kplus_data.examples:
        pred = predict(kplus_model, e.text)
        preds.append(pred)
        labels.append(LABEL_KPLUS.vocab.stoi[e.label])
    log_info(labels, preds)

    logging.debug("kplus pseudolabel")
    preds = []
    labels = []
    for e in test_kplus_data.examples:
        pred = predict_pseudolabel(kplus_pseudolabel, e.text)
        preds.append(pred)
        labels.append(LABEL_KPLUS.vocab.stoi[e.label])
    log_info(labels, preds)

    import nltk
    import string
    from nltk.corpus import stopwords
    stop_words_en = list(set(stopwords.words('english')))
    stop_words_fr = list(set(stopwords.words('french')))
    stop_words_sp = list(set(stopwords.words('spanish')))
    stop_words = set(stop_words_en + stop_words_fr + stop_words_sp)

    def update_seeds(seed_words_d, no_filtering, no_tuning):
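        """Refine the seed lexicon: score each word by the KL divergence between
        the model's prediction with and without it (leave-one-word-out), weight by
        how concentrated the word is in a single aspect (popularity * distinctiveness),
        and drop words shared across aspects or assigned to 'miscellaneous'."""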
        tf1 = collections.defaultdict(dict)
        pool1 = collections.defaultdict(dict)
        kl = nn.KLDivLoss()
        for e in test_data.examples:
            #p_orig, label = get_qs(k_pseudolabel, e.text)
            p_orig, label = get_p(k_model, e.text)
            kls = []
            words = []
            #label = LABEL.vocab.itos[label.item()]
            label = e.label
            for i in range(len(e.text)):
                tmp = e.text[i]
                if tmp not in tf1[label]:
                    tf1[label][tmp] = 0
                tf1[label][tmp] += 1
                if tmp in stop_words or tmp in string.punctuation or tmp == '<unk>' or tmp == '<pad>':
                    continue
                e.text[i] = '<unk>'
                #p_new, _ = get_qs(k_pseudolabel, e.text)
                p_new, _ = get_p(k_model, e.text)
                loss = kl(torch.log(p_orig.detach()), p_new.detach())
                kls.append(loss.item())
                words.append(tmp)
                e.text[i] = tmp
            lst = list(zip(words, kls))
            lst.sort(key=lambda x: x[1], reverse=True)

            #print(lst[:len_])
            if not no_tuning:
                for i in range(len(lst) // 4):
                    threshold = 5e-2
                    if lst[i][1] > threshold:
                        if lst[i][0] not in pool1[label]:
                            pool1[label][lst[i][0]] = 0
                        pool1[label][lst[i][0]] += lst[i][1]

        pops1 = collections.defaultdict(dict)
        aspects1 = list(tf1.keys())
        for i in range(len(aspects1)):
            for word in tf1[aspects1[i]]:
                sum_ = 0
                for j in range(len(aspects1)):
                    if word in tf1[aspects1[j]]:
                        sum_ += tf1[aspects1[j]][word]
                pops1[aspects1[i]][word] = tf1[aspects1[i]][word] / sum_

        dists1 = collections.defaultdict(dict)
        for i in range(len(aspects1)):
            for word in tf1[aspects1[i]]:
                max_ = 0
                for j in range(len(aspects1)):
                    if word in tf1[aspects1[j]]:
                        max_ = max(max_, tf1[aspects1[j]][word])
                dists1[aspects1[i]][word] = tf1[aspects1[i]][word] / max_

        scores1 = collections.defaultdict(dict)
        for i in range(len(aspects1)):
            if no_tuning:
                for word in tf1[aspects1[i]]:
                    scores1[aspects1[i]][word] = pops1[
                        aspects1[i]][word] * dists1[aspects1[i]][word]
            else:
                for word in pool1[aspects1[i]]:
                    scores1[aspects1[i]][word] = pops1[
                        aspects1[i]][word] * dists1[aspects1[i]][word]

        candidates1 = collections.defaultdict(list)
        for aspect in aspects1:
            candidates1[aspect] = sorted(scores1[aspect].items(),
                                         key=lambda x: x[1],
                                         reverse=True)

        commons1 = set()
        aspects1 = list(candidates1.keys())
        for i in range(len(aspects1) - 1):
            for j in range(i + 1, len(aspects1)):
                lst1, _ = zip(*candidates1[aspects1[i]])
                lst2, _ = zip(*candidates1[aspects1[j]])
                common = set.intersection(set(lst1), set(lst2))
                for c in common:
                    commons1.add(c)

        miscs = set()
        if not no_filtering:
            tf2 = collections.defaultdict(dict)
            pool2 = collections.defaultdict(dict)
            kl = nn.KLDivLoss()
            for e in test_kplus_data.examples:
                #p_orig, label = get_qs(kplus_pseudolabel, e.text)
                p_orig, label = get_p(kplus_model, e.text)
                kls = []
                words = []
                #label = LABEL.vocab.itos[label.item()]
                label = e.label
                for i in range(len(e.text)):
                    tmp = e.text[i]
                    if tmp not in tf2[label]:
                        tf2[label][tmp] = 0
                    tf2[label][tmp] += 1
                    if tmp in stop_words or tmp in string.punctuation or tmp == '<unk>' or tmp == '<pad>':
                        continue
                    e.text[i] = '<unk>'
                    #p_new, _ = get_qs(kplus_pseudolabel, e.text)
                    p_new, _ = get_p(kplus_model, e.text)
                    loss = kl(torch.log(p_orig.detach()), p_new.detach())
                    kls.append(loss.item())
                    words.append(tmp)
                    e.text[i] = tmp
                lst = list(zip(words, kls))
                lst.sort(key=lambda x: x[1], reverse=True)

                #print(lst[:len_])

                for i in range(len(lst) // 4):
                    threshold = 1e-2
                    if lst[i][1] > threshold:
                        if lst[i][0] not in pool2[label]:
                            pool2[label][lst[i][0]] = 0
                        pool2[label][lst[i][0]] += lst[i][1]

            pops2 = collections.defaultdict(dict)
            aspects2 = list(tf2.keys())
            for i in range(len(aspects2)):
                for word in tf2[aspects2[i]]:
                    sum_ = 0
                    for j in range(len(aspects2)):
                        if word in tf2[aspects2[j]]:
                            sum_ += tf2[aspects2[j]][word]
                    pops2[aspects2[i]][word] = tf2[aspects2[i]][word] / sum_

            dists2 = collections.defaultdict(dict)
            for i in range(len(aspects2)):
                for word in tf2[aspects2[i]]:
                    max_ = 0
                    for j in range(len(aspects2)):
                        if word in tf2[aspects2[j]]:
                            max_ = max(max_, tf2[aspects2[j]][word])
                    dists2[aspects2[i]][word] = tf2[aspects2[i]][word] / max_

            scores2 = collections.defaultdict(dict)
            for i in range(len(aspects2)):
                for word in pool2[aspects2[i]]:
                    scores2[aspects2[i]][word] = pops2[
                        aspects2[i]][word] * dists2[aspects2[i]][word]

            candidates2 = collections.defaultdict(list)

            for aspect in aspects2:
                candidates2[aspect] = sorted(scores2[aspect].items(),
                                             key=lambda x: x[1],
                                             reverse=True)

            print(candidates2['miscellaneous'])

            for i in range(len(candidates2['miscellaneous'])):
                word, score = candidates2['miscellaneous'][i]
                if score > 1e-2:
                    miscs.add(word)

        for aspect in aspects:
            if not no_filtering:
                for word in miscs:
                    if word in seed_words_d[aspect]:
                        seed_words_d[aspect].remove(word)
            i = 0
            while len(seed_words_d[aspect]) < args.seedword_limit and i < len(
                    candidates1[aspect]):
                word, score = candidates1[aspect][i]
                if word not in seed_words_d[
                        aspect] and word not in commons1 and word not in miscs and score >= args.score_threshold:
                    seed_words_d[aspect].add(word)
                i += 1

        commons2 = set()
        aspects2 = list(seed_words_d.keys())
        for i in range(len(aspects2) - 1):
            for j in range(i + 1, len(aspects2)):
                lst1 = seed_words_d[aspects2[i]]
                lst2 = seed_words_d[aspects2[j]]
                common = set.intersection(set(lst1), set(lst2))
                for c in common:
                    commons2.add(c)

        for aspect in aspects2:
            for c in commons2:
                if c in seed_words_d[aspect]:
                    seed_words_d[aspect].remove(c)

    update_seeds(seed_words_d, args.no_filtering, args.no_tuning)
    print(seed_words_d)

    for k in seed_words_d:
        seed_words_d[k] = list(seed_words_d[k])
    for k in seed_words_d:
        seed_words_d[k] = set(seed_words_d[k])

    seed_words = sorted(seed_words_d.items(),
                        key=lambda x: LABEL.vocab.stoi[x[0]])
    print(seed_words)
    logging.debug(seed_words)

    # re-embed the updated seed words so the rebuilt models actually use them
    SEED_WORDS = get_seed_embedding(seed_words)
    k_model, k_pseudolabel = init_kmodel(SEED_WORDS)
    k_model_optimizer = optim.Adam(
        filter(lambda p: p.requires_grad, k_model.parameters()))

    N_EPOCHS = 5
    for epoch in range(N_EPOCHS):

        print("epoch: ", epoch + 1)

        train_loss, train_acc = train(k_model, k_pseudolabel, train_iterator,
                                      k_model_optimizer)

        print("training loss: ", train_loss)
        print("training accuracy: ", train_acc)

        valid_acc, valid_f1 = evaluate(k_model, test_data, LABEL)

        print("validation accuracy: ", valid_acc)
        print('validation F1:', valid_f1)
    torch.cuda.empty_cache()

    preds = []
    labels = []
    for e in test_data.examples:
        pred = predict(k_model, e.text)
        preds.append(pred)
        labels.append(LABEL.vocab.stoi[e.label])
    log_info(labels, preds)

    logging.debug("k pseudolabel")
    preds = []
    labels = []
    for e in test_data.examples:
        pred = predict_pseudolabel(k_pseudolabel, e.text)
        preds.append(pred)
        labels.append(LABEL.vocab.stoi[e.label])
    log_info(labels, preds)

    threshold = compute_threshold()
    #threshold = 0.2
    print("threshold:", threshold)
    logging.debug("threshold:" + str(threshold))

    kplus_model, kplus_pseudolabel = init_kplusmodel(SEED_WORDS)
    kplus_model_optimizer = optim.Adam(
        filter(lambda p: p.requires_grad, kplus_model.parameters()))

    N_EPOCHS = 5
    for epoch in range(N_EPOCHS):

        print("epoch: ", epoch + 1)

        train_loss, train_acc = train(kplus_model, kplus_pseudolabel,
                                      train_iterator, kplus_model_optimizer)

        print("training loss: ", train_loss)
        print("training accuracy: ", train_acc)

        valid_acc, valid_f1 = evaluate(kplus_model, test_kplus_data,
                                       LABEL_KPLUS)

        print("validation accuracy: ", valid_acc)
        print('validation F1:', valid_f1)
    torch.cuda.empty_cache()

    preds = []
    labels = []
    for e in test_kplus_data.examples:
        pred = predict(kplus_model, e.text)
        preds.append(pred)
        labels.append(LABEL_KPLUS.vocab.stoi[e.label])
    log_info(labels, preds)

    logging.debug("kplus pseudolabel")
    preds = []
    labels = []
    for e in test_kplus_data.examples:
        pred = predict_pseudolabel(kplus_pseudolabel, e.text)
        preds.append(pred)
        labels.append(LABEL_KPLUS.vocab.stoi[e.label])
    log_info(labels, preds)

    for i in range(3):
        logging.debug("iteration: " + str(i + 1))

        import copy
        seed_words_d_copy = copy.deepcopy(seed_words_d)

        update_seeds(seed_words_d, args.no_filtering, args.no_tuning)
        print(seed_words_d)

        seed_words = sorted(seed_words_d.items(),
                            key=lambda x: LABEL.vocab.stoi[x[0]])
        print(seed_words)
        logging.debug(seed_words)

        if seed_words_d == seed_words_d_copy:
            break

        SEED_WORDS = get_seed_embedding(seed_words)
        k_model, k_pseudolabel = init_kmodel(SEED_WORDS)
        k_model_optimizer = optim.Adam(
            filter(lambda p: p.requires_grad, k_model.parameters()))

        N_EPOCHS = 5
        for epoch in range(N_EPOCHS):

            print("epoch: ", epoch + 1)

            train_loss, train_acc = train(k_model, k_pseudolabel,
                                          train_iterator, k_model_optimizer)

            print("training loss: ", train_loss)
            print("training accuracy: ", train_acc)

            valid_acc, valid_f1 = evaluate(k_model, test_data, LABEL)

            print("validation accuracy: ", valid_acc)
            print('validation F1:', valid_f1)
        torch.cuda.empty_cache()

        preds = []
        labels = []
        for e in test_data.examples:
            pred = predict(k_model, e.text)
            preds.append(pred)
            labels.append(LABEL.vocab.stoi[e.label])
        log_info(labels, preds)

        logging.debug("k pseudolabel")
        preds = []
        labels = []
        for e in test_data.examples:
            pred = predict_pseudolabel(k_pseudolabel, e.text)
            preds.append(pred)
            labels.append(LABEL.vocab.stoi[e.label])
        log_info(labels, preds)

        threshold = compute_threshold()
        #threshold = 0.2
        print("threshold:", threshold)
        logging.debug("threshold:" + str(threshold))

        kplus_model, kplus_pseudolabel = init_kplusmodel(SEED_WORDS)
        kplus_model_optimizer = optim.Adam(
            filter(lambda p: p.requires_grad, kplus_model.parameters()))

        N_EPOCHS_2 = 5
        for epoch in range(N_EPOCHS_2):

            print("epoch: ", epoch + 1)

            train_loss, train_acc = train(kplus_model, kplus_pseudolabel,
                                          train_iterator,
                                          kplus_model_optimizer)

            print("training loss: ", train_loss)
            print("training accuracy: ", train_acc)

            valid_acc, valid_f1 = evaluate(kplus_model, test_kplus_data,
                                           LABEL_KPLUS)

            print("validation accuracy: ", valid_acc)
            print('validation F1:', valid_f1)
        torch.cuda.empty_cache()

        preds = []
        labels = []
        for e in test_kplus_data.examples:
            pred = predict(kplus_model, e.text)
            preds.append(pred)
            labels.append(LABEL_KPLUS.vocab.stoi[e.label])
        log_info(labels, preds)

        logging.debug("kplus pseudolabel")
        preds = []
        labels = []
        for e in test_kplus_data.examples:
            pred = predict_pseudolabel(kplus_pseudolabel, e.text)
            preds.append(pred)
            labels.append(LABEL_KPLUS.vocab.stoi[e.label])
        log_info(labels, preds)
def cross_val_score(
    Model,
    model_kwargs,
    model_path,
    custom_embeddings,
    vocab_kwargs,
    data_path,
    label_column,
    text_column,
    other_fields,
    process_text,
    process_labels,
    Optimizer,
    optimizer_kwargs,
    criterion,
    batch_size,
    n_epochs,
    writer,
    device,
):
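    """Run cross-validation over pre-split fold files train_{fold}.json /
    test_{fold}.json in data_path; return the per-fold validation accuracy at the
    best-loss epoch and at the final epoch."""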
    p = Path(data_path)
    n_files = len(list(p.glob('*.json')))
    # here we assume that each fold corresponds to exactly 2 files: test and train
    assert n_files % 2 == 0
    n_splits = n_files // 2
    # we keep arrays with the validation accuracy at the last epoch and at the epoch with the best loss
    best_accuracy = []
    final_accuracy = []
    for fold in range(n_splits):

        # every model we use needs fields for the text and the target label
        TEXT = data.Field(**process_text)
        LABEL = data.LabelField(**process_labels)
        fields = {label_column: ('label', LABEL), text_column: ('text', TEXT)}
        # some models require additional fields; those are defined in the calling context
        fields.update(other_fields)

        train_data = data.TabularDataset(
            path=Path(data_path, f'train_{fold}.json'),
            format='json',
            fields=fields,
        )
        test_data = data.TabularDataset(
            path=Path(data_path, f'test_{fold}.json'),
            format='json',
            fields=fields,
        )

        TEXT.build_vocab(train_data, vectors=custom_embeddings, **vocab_kwargs)
        LABEL.build_vocab(train_data)

        input_dim = len(TEXT.vocab)
        output_dim = len(LABEL.vocab)
        pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
        model = Model(input_dim, output_dim, pad_idx=pad_idx, **model_kwargs)

        if custom_embeddings is not None:
            embeddings = TEXT.vocab.vectors
            model.embedding.weight.data.copy_(embeddings)

        # explicitly zero out the embedding for <pad>
        model.embedding.weight.data[pad_idx] = torch.zeros(
            model_kwargs['embedding_dim'])
        optimizer = Optimizer(model.parameters(), **optimizer_kwargs)
        model = model.to(device)
        criterion = criterion.to(device)

        train_iterator, test_iterator = data.BucketIterator.splits(
            (train_data, test_data),
            batch_size=batch_size,
            sort_key=lambda ex: len(ex.text),
            sort_within_batch=True,
            device=device)

        best_valid_acc, final_valid_acc = train_model(model,
                                                      train_iterator,
                                                      test_iterator,
                                                      optimizer,
                                                      criterion,
                                                      model_path + f'_{fold}',
                                                      n_epochs=n_epochs,
                                                      comment=f'fold_{fold}',
                                                      writer=writer)

        best_accuracy.append(best_valid_acc)
        final_accuracy.append(final_valid_acc)

    return best_accuracy, final_accuracy
    def initialise_train(self):

        #create random seeds
        SEED = 1234
        random.seed(SEED)
        np.random.seed(SEED)
        torch.manual_seed(SEED)
        torch.backends.cudnn.deterministic = True

        #Use torchtext.data to create dataset using random seeds
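        # use_vocab=False together with convert_tokens_to_ids means the field stores
        # raw BERT token ids, so no torchtext vocabulary is built for the text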
        TEXT = data.Field(batch_first=True,
                          use_vocab=False,
                          tokenize=self.tokenize_and_cut,
                          preprocessing=self.tokenizer.convert_tokens_to_ids,
                          init_token=self.init_token_idx,
                          eos_token=self.eos_token_idx,
                          pad_token=self.pad_token_idx,
                          unk_token=self.unk_token_idx)

        LABEL = data.LabelField(dtype=torch.float)

        train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
        train_data, valid_data = train_data.split(
            random_state=random.seed(SEED))

        print(f"Number of training examples: {len(train_data)}")
        print(f"Number of validation examples: {len(valid_data)}")
        print(f"Number of testing examples: {len(test_data)}")
        print(vars(train_data.examples[6]))
        tokens = self.tokenizer.convert_ids_to_tokens(
            vars(train_data.examples[6])['text'])
        print(tokens)
        LABEL.build_vocab(train_data)
        print(LABEL.vocab.stoi)

        #Freeze some model parameters to increase training speed
        for name, param in self.model.named_parameters():
            if name.startswith('bert'):
                param.requires_grad = False

        #Setup for training
        train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
            (train_data, valid_data, test_data),
            batch_size=self.BATCH_SIZE,
            device=self.device)
        optimizer = optim.Adam(self.model.parameters())
        criterion = nn.BCEWithLogitsLoss()
        self.model = self.model.to(self.device)
        criterion = criterion.to(self.device)
        N_EPOCHS = 5
        best_valid_loss = float('inf')

        #start training loop
        for epoch in range(N_EPOCHS):

            start_time = time.time()

            train_loss, train_acc = self.train(self.model, train_iterator,
                                               optimizer, criterion)
            valid_loss, valid_acc = self.evaluate(self.model, valid_iterator,
                                                  criterion)

            end_time = time.time()

            epoch_mins, epoch_secs = self.epoch_time(start_time, end_time)

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(self.model.state_dict(), 'tut6-model.pt')

            print(
                f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s'
            )
            print(
                f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%'
            )
            print(
                f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%'
            )
Example #25
    def __init__(self, args):
        self.TEXT = data.Field(lower=args.lower, tokenize=lambda x: x.split())
        # We'll use NestedField to tokenize each word into list of chars
        if args.char:
            CHAR_NESTING = data.Field()
            self.char_text = data.NestedField(CHAR_NESTING)
        self.LABEL = data.LabelField()
        self.ids = data.Field(sequential=True, use_vocab=True)
        if args.char:
            fields = {
                'question': ('question', self.TEXT),
                'char_question': ('question_c', self.char_text),
                'label': ('label', self.LABEL),
                'text': ('answer', self.TEXT),
                'char_text': ('answer_c', self.char_text)
            }
            test_fields = {
                '__id__': ('q_id', self.ids),
                'question': ('question', self.TEXT),
                'char_question': ('question_c', self.char_text),
                'text': ('answer', self.TEXT),
                'char_text': ('answer_c', self.char_text),
                'id': ('a_id', self.ids)
            }
        else:
            fields = {
                'question': ('question', self.TEXT),
                'label': ('label', self.LABEL),
                'text': ('answer', self.TEXT)
            }
            test_fields = {
                '__id__': ('q_id', self.ids),
                'id': ('a_id', self.ids),
                'question': ('question', self.TEXT),
                'text': ('answer', self.TEXT)
            }

        data_zalo = data.TabularDataset(
            path='../wikiqa_zalo/data/train_pr.json',
            format='json',
            fields=fields)
        data_submission = data.TabularDataset(
            path='../wikiqa_zalo/data/test_pr_submission.json',
            format='json',
            fields=test_fields)

        self.train, self.test = data_zalo.split(0.8,
                                                random_state=random.seed(SEED))
        self.train, self.dev = self.train.split(0.8,
                                                random_state=random.seed(SEED))
        # len(data_zalo), len(train), len(test), len(valid)

        self.TEXT.build_vocab(self.train, self.dev, self.test)
        self.ids.build_vocab(data_submission)
        if args.char:
            self.char_text.build_vocab(self.train, self.dev, self.test)

        if args.word_vectors:
            if os.path.isfile(args.vector_cache):
                print('Found pretrained word embeddings')
                cache, name = '/'.join(args.vector_cache.split('/')
                                       [:-1]), args.vector_cache.split('/')[-1]
                vectors = Vectors(cache=cache, name=name)
                self.TEXT.vocab.set_vectors(vectors.stoi, vectors.vectors,
                                            vectors.dim)
                # inputs.vocab.vectors = torch.load(args.vector_cache)
            else:
                print('Pretrained word embeddings not found\nDownloading')
                self.TEXT.vocab.load_vectors(args.word_vectors)
                makedirs(os.path.dirname(args.vector_cache))
                torch.save(self.TEXT.vocab.vectors, args.vector_cache)

        if args.char_vectors and args.char:
            print('Found pretrained character embeddings')
            cache, name = '/'.join(args.char_vectors.split('/')
                                   [:-1]), args.char_vectors.split('/')[-1]
            char_vectors = Vectors(cache=cache, name=name)
            self.char_text.vocab.set_vectors(char_vectors.stoi,
                                             char_vectors.vectors,
                                             char_vectors.dim)
        self.LABEL.build_vocab(self.train)

        def sort_key(x):
            return data.interleave_keys(len(x.question), len(x.answer))

        self.train_iter, self.dev_iter, self.test_iter = data.BucketIterator.splits(
            (self.train, self.dev, self.test),
            batch_size=args.batch_size,
            device=args.device,
            sort_key=sort_key,
            sort_within_batch=False)
        self.max_word_len = max([len(w) for w in self.TEXT.vocab.itos])

        self.submission_iter = data.BucketIterator(data_submission,
                                                   batch_size=args.batch_size,
                                                   device=args.device,
                                                   sort_within_batch=False)
Example #26
ApparelTEXT = data.Field(tokenize='spacy')
ApparelLABEL = data.LabelField(tensor_type=torch.FloatTensor)
print("loading dataset clean_Apparel300.tsv...")
Appareltrain = data.TabularDataset.splits(
        path='../stanford-corenlp-full-2018-10-05/stanfordSentimentTreebank/',
        train='mytrain2.tsv',
        format='tsv',
        fields=[('Text', ApparelTEXT), ('Label', ApparelLABEL)])[0]
ApparelTEXT.build_vocab(Appareltrain, max_size=60000,
                        vectors="glove.6B.300d", min_freq=1)
ApparelLABEL.build_vocab(Appareltrain)
for a, b in ApparelLABEL.vocab.stoi.items():
    ApparelLABEL.vocab.stoi[a] = float(a)

JewelryTEXT = data.Field(tokenize='spacy')
JewelryLABEL = data.LabelField(tensor_type=torch.FloatTensor)
print("loading dataset stanford-sentiment-treebank.train.tsv...")
Jewelrytrain = data.TabularDataset.splits(
        path='../stanford-corenlp-full-2018-10-05/stanfordSentimentTreebank/',
        train='stanford-sentiment-treebank.train.tsv',
        format='tsv',
        fields=[('Text', JewelryTEXT), ('Label', JewelryLABEL)])[0]
JewelryTEXT.build_vocab(Jewelrytrain, max_size=60000,
                        vectors="glove.6B.300d", min_freq=1)
JewelryLABEL.build_vocab(Jewelrytrain)
for a, b in JewelryLABEL.vocab.stoi.items():
    JewelryLABEL.vocab.stoi[a] = float(a)
    
ShoesTEXT = data.Field(tokenize='spacy')
ShoesLABEL = data.LabelField(tensor_type=torch.FloatTensor)
print("loading dataset stanford-sentiment-treebank.train.tsv...")
Shoestrain  = data.TabularDataset.splits(
Example #27
BATCH_SIZE = 256
LEARNING_RATE = 1e-3
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
NUM_EPOCHS = 20
LAMBDA = 1e-3

####################################
#          Preparing Data          #
####################################
# 1. data.Field()
TEXT = data.Field(include_lengths=True, pad_token='<pad>', unk_token='<unk>')
TAG_LABEL = data.LabelField()
AGE_LABEL = data.LabelField()
GENDER_LABEL = data.LabelField()

# 2. data.TabularDataset
train_data, test_data = data.TabularDataset.splits(
    path=TrustPilot_processed_dataset_path,
    train="train.csv",
    test="test.csv",
    fields=[('text', TEXT), ('tag_label', TAG_LABEL), ('age_label', AGE_LABEL),
            ('gender_label', GENDER_LABEL)],
    format="csv")

# 3. Split train_data to train_data, valid_data
train_data, valid_data = train_data.split(random_state=random.seed(SEED))
print("Number of train_data = {}".format(len(train_data)))
Example #28
from torchtext import data, datasets

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if __name__=='__main__':
    print('Using device:', device)

"""## Download dataset
First we will download the dataset using [torchtext](https://torchtext.readthedocs.io/en/latest/index.html), which is a package that supports NLP for PyTorch. The following command will get you 3 objects `train_data`, `val_data` and `test_data`. To access the data:

*   To access list of textual tokens - `train_data[0].text`
*   To access label - `train_data[0].label`
"""

if __name__=='__main__':
    train_data, val_data, test_data = datasets.SST.splits(
        data.Field(tokenize='spacy'),
        data.LabelField(dtype=torch.float),
        filter_pred=lambda ex: ex.label != 'neutral')

    print('{:d} train and {:d} test samples'.format(len(train_data), len(test_data)))

    print('Sample text:', train_data[0].text)
    print('Sample label:', train_data[0].label)

"""# 1. Define the Dataset Class (4 points)

In the following cell, we will define the dataset class. You need to implement the following functions: 


*   ` build_dictionary() ` - creates the dictionaries `ixtoword` and `wordtoix`. Converts all the text of all examples, in the form of text ids and stores them in `textual_ids`. If a word is not present in your dictionary, it should use `<unk>`. Use the hyperparameter `THRESHOLD` to control which words appear in the dictionary, based on their frequency in the training data. Note that a word’s frequency should be `>=THRESHOLD` to be included in the dictionary. Also make sure that `<end>` should be at idx 0, and `<unk>` should be at idx 1

*   ` get_label() ` - This function should return the value `1` if the label in the dataset is `positive`, and should return `0` if it is `negative`. The data type for the returned item should be `torch.LongTensor`
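A minimal sketch of the two methods described above (hypothetical: it assumes the dataset keeps its raw examples in `self.examples`, each with a `.text` token list and a `.label` string, and that `THRESHOLD` is the frequency hyperparameter mentioned above):

from collections import Counter

import torch

THRESHOLD = 5  # example value for the frequency cut-off


class SentimentDataset:
    def build_dictionary(self):
        # count token frequencies over the training text
        counts = Counter(tok for ex in self.examples for tok in ex.text)
        # <end> must sit at index 0 and <unk> at index 1
        self.ixtoword = {0: '<end>', 1: '<unk>'}
        self.wordtoix = {'<end>': 0, '<unk>': 1}
        for word, freq in counts.items():
            if freq >= THRESHOLD and word not in self.wordtoix:
                idx = len(self.wordtoix)
                self.wordtoix[word] = idx
                self.ixtoword[idx] = word
        # store every example as a list of ids; out-of-vocabulary words map to <unk>
        self.textual_ids = [[self.wordtoix.get(tok, 1) for tok in ex.text]
                            for ex in self.examples]

    def get_label(self, index):
        # 'positive' -> 1, 'negative' -> 0, returned as a LongTensor scalar
        value = 1 if self.examples[index].label == 'positive' else 0
        return torch.tensor(value, dtype=torch.long)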
Example #29
from google.colab import drive
drive.mount('/content/drive/')

cd drive/My Drive/DL2/

cd DL2

import torch   

#handling text data
from torchtext import data  
import torch.optim as optim

tokenize = lambda x: x.split()
TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=200)
LABEL = data.LabelField()
# fields = [('label', LABEL), ('text',TEXT)]
fields = [(None, None), ('label', LABEL), ('text', TEXT)]
train_data = data.TabularDataset(path='p_training_data.csv', fields=fields,
                                 format='csv', skip_header=True)
valid_data = data.TabularDataset(path='p_validation_data.csv', fields=fields,
                                 format='csv', skip_header=True)
#print preprocessed text
print(vars(train_data.examples[0]))

SEED = 2019

#Torch
torch.manual_seed(SEED)

#Cuda algorithms
torch.backends.cudnn.deterministic = True
Example #30
def build_field(stop_word):
    for ii in range(len(stop_word)):
        stop_word[ii] = str(stop_word[ii])
    text_field = data.Field(stop_words=stop_word)
    label_field = data.LabelField(use_vocab=False)
    return text_field, label_field