Example #1
import numpy as np

from torchnlp.datasets import snli_dataset


def snli(percentage=None):
    """
    Returns the SNLI dataset, splits included.

    :param float percentage: fraction (0-1) of each split to keep
    :returns: the SNLI dataset as (train, dev, test) splits
    :rtype: tuple
    """
    # `data_dir` is assumed to be defined at module level.
    train, dev, test = snli_dataset(data_dir, train=True, dev=True, test=True)

    if percentage:
        # `np.int` was removed in NumPy 1.24; the builtin `int` is equivalent here.
        train = train[:int(np.ceil(len(train) * percentage))]
        dev = dev[:int(np.ceil(len(dev) * percentage))]
        test = test[:int(np.ceil(len(test) * percentage))]

    return train, dev, test
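A minimal usage sketch, assuming `data_dir` points at a valid download directory (both the directory and the 10% fraction are hypothetical):

data_dir = 'data/'  # assumed download location
# Keep 10% of every split for a quick smoke test.
train, dev, test = snli(percentage=0.1)
print(len(train), len(dev), len(test))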
Example #2
import itertools

import torch

from torchnlp.samplers import BucketBatchSampler
from torchnlp.datasets import snli_dataset
from torchnlp.encoders.text import WhitespaceEncoder
from torchnlp.encoders import LabelEncoder
from torchnlp import word_to_vector

from model import SNLIClassifier
from util import get_args, makedirs, collate_fn

args = get_args()

if args.gpu >= 0:
    torch.cuda.set_device(args.gpu)

# Load dataset
train, dev, test = snli_dataset(train=True, dev=True, test=True)

# Preprocess
for row in itertools.chain(train, dev, test):
    row['premise'] = row['premise'].lower()
    row['hypothesis'] = row['hypothesis'].lower()

# Make Encoders
sentence_corpus = [row['premise'] for row in itertools.chain(train, dev, test)]
sentence_corpus += [
    row['hypothesis'] for row in itertools.chain(train, dev, test)
]
sentence_encoder = WhitespaceEncoder(sentence_corpus)

label_corpus = [row['label'] for row in itertools.chain(train, dev, test)]
label_encoder = LabelEncoder(label_corpus)
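A quick sanity check of the fitted encoders (a sketch; in torchnlp, `encode` maps a sentence to a tensor of token indices and a label to a scalar tensor):

# Hypothetical check on the first training example.
premise_tensor = sentence_encoder.encode(train[0]['premise'])
label_tensor = label_encoder.encode(train[0]['label'])
print(premise_tensor, label_tensor)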
Example #3
from collections import Counter

import nltk
from torchnlp.datasets import snli_dataset
from tqdm import tqdm


# Assumed minimal stand-in for the snippet's undefined vocab wrapper.
class Vocabulary:
    """Simple vocabulary wrapper mapping words to indices."""

    def __init__(self):
        self.word2idx = {}

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = len(self.word2idx)


def prepare_vocab(dataset, threshold):
    counter = Counter()

    # Requires NLTK's tokenizer models, e.g. nltk.download('punkt').
    for t in tqdm(dataset):
        premise = t['premise']
        hypothesis = t['hypothesis']
        premise_tokens = nltk.word_tokenize(premise)
        hypothesis_tokens = nltk.word_tokenize(hypothesis)
        tokens = premise_tokens + hypothesis_tokens
        counter.update(tokens)

    # Words with a frequency below `threshold` are discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add the special <unk> token.
    vocab = Vocabulary()
    vocab.add_word('<unk>')

    # Add the remaining words to the vocabulary.
    for word in tqdm(words):
        vocab.add_word(word)

    return vocab


if __name__ == '__main__':
    train_data = snli_dataset(train=True)
    vocab = prepare_vocab(train_data, 3)
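Once built, a lookup can fall back to the `<unk>` index (a sketch against the minimal `Vocabulary` wrapper above; the word 'dog' is just an example):

# Hypothetical lookup with an out-of-vocabulary fallback.
idx = vocab.word2idx.get('dog', vocab.word2idx['<unk>'])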
Example #4
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from torchnlp.datasets import snli_dataset
from torchtext import data
from torchtext import datasets

word_to_ix = {'entailment': 0, 'neutral': 1, 'contradiction': 2, '-': 3}
ix_to_word = {0: 'entailment', 1: 'neutral', 2: 'contradiction', 3: '-'}


def map_to_ix(x):
    return word_to_ix[x]


def map_to_word(x):
    return ix_to_word[x]

# Data preprocessing
train = pd.DataFrame(snli_dataset(train=True), columns=['premise', 'hypothesis', 'label'])
train['label'] = train['label'].apply(map_to_ix)
val = pd.DataFrame(snli_dataset(dev=True), columns=['premise', 'hypothesis', 'label'])
val['label'] = val['label'].apply(map_to_ix)
test = pd.DataFrame(snli_dataset(test=True), columns=['premise', 'hypothesis', 'label'])
test['label'] = test['label'].apply(map_to_ix)

# TF-IDF + logistic regression classifier
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(penalty='l2',
                               multi_class='auto', solver='saga',
                               max_iter=100, tol=1e-3)),
])
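The snippet stops at defining the pipeline; a hedged sketch of fitting and scoring it, assuming premise and hypothesis are simply concatenated into a single text field (a choice not shown in the original):

# Hypothetical fit/evaluate step: join premise and hypothesis per pair.
train_text = train['premise'] + ' ' + train['hypothesis']
test_text = test['premise'] + ' ' + test['hypothesis']
text_clf.fit(train_text, train['label'])
print('test accuracy:', text_clf.score(test_text, test['label']))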