import numpy as np
from torchnlp.datasets import snli_dataset


def snli(data_dir, percentage=None):
    """
    Returns the SNLI dataset, splits included.

    :param str data_dir: directory in which the dataset is stored or downloaded
    :param float percentage: fraction of each split to use, in (0, 1]
    :returns: the SNLI dataset as (train, dev, test) splits
    :rtype: tuple
    """
    train, dev, test = snli_dataset(data_dir, train=True, dev=True, test=True)
    if percentage:
        # Truncate each split to the requested fraction of its examples.
        train = train[:int(np.ceil(len(train) * percentage))]
        dev = dev[:int(np.ceil(len(dev) * percentage))]
        test = test[:int(np.ceil(len(test) * percentage))]
    return train, dev, test
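# A minimal usage sketch: keep 10% of every split for a quick smoke test
# (the 'data/' directory and the 0.1 fraction are illustrative choices).
if __name__ == '__main__':
    train, dev, test = snli('data/', percentage=0.1)
    print(len(train), len(dev), len(test))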
import itertools

import torch
from torchnlp.samplers import BucketBatchSampler
from torchnlp.datasets import snli_dataset
from torchnlp.encoders.text import WhitespaceEncoder
from torchnlp.encoders import LabelEncoder
from torchnlp import word_to_vector

from model import SNLIClassifier
from util import get_args, makedirs, collate_fn

args = get_args()

if args.gpu >= 0:
    torch.cuda.set_device(args.gpu)

# Load dataset
train, dev, test = snli_dataset(train=True, dev=True, test=True)

# Preprocess: lowercase both sentences of every example.
for row in itertools.chain(train, dev, test):
    row['premise'] = row['premise'].lower()
    row['hypothesis'] = row['hypothesis'].lower()

# Make encoders over the full corpus of premises and hypotheses.
sentence_corpus = [row['premise'] for row in itertools.chain(train, dev, test)]
sentence_corpus += [
    row['hypothesis'] for row in itertools.chain(train, dev, test)
]
sentence_encoder = WhitespaceEncoder(sentence_corpus)

label_corpus = [row['label'] for row in itertools.chain(train, dev, test)]
label_encoder = LabelEncoder(label_corpus)
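# A short sketch of how these encoders and the imported BucketBatchSampler
# might be wired together (torchnlp 0.5-style sampler API assumed; the
# batch size and sort key are illustrative, and `collate_fn` is the
# project's own helper imported above).
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SequentialSampler

example = train[0]
premise_ids = sentence_encoder.encode(example['premise'])  # 1-D LongTensor of token ids
label_id = label_encoder.encode(example['label'])          # scalar LongTensor

train_batch_sampler = BucketBatchSampler(
    SequentialSampler(train), batch_size=32, drop_last=False,
    sort_key=lambda i: len(train[i]['premise'].split()))
train_loader = DataLoader(train, batch_sampler=train_batch_sampler,
                          collate_fn=collate_fn)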
from collections import Counter

import nltk
from torchnlp.datasets import snli_dataset
from tqdm import tqdm


def prepare_vocab(dataset, threshold):
    """Build a vocabulary from the dataset, keeping only words that occur
    at least `threshold` times."""
    counter = Counter()
    for t in tqdm(dataset):
        premise = t['premise']
        hypothesis = t['hypothesis']
        premise_tokens = nltk.word_tokenize(premise)
        hypothesis_tokens = nltk.word_tokenize(hypothesis)
        tokens = premise_tokens + hypothesis_tokens
        counter.update(tokens)

    # If the word frequency is less than `threshold`, the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    # `Concept` and `get_vector` are project-local helpers defined elsewhere.
    vocab = Concept()
    vocab.add_word('<unk>')

    # Add every retained word pair, with its pairwise vector, to the vocabulary.
    for word in tqdm(words):
        for other in words:
            vec = get_vector(word, other)
            vocab.add_word(word, other, vec)
    return vocab


if __name__ == '__main__':
    train_data = snli_dataset(train=True)
    vocab = prepare_vocab(train_data, 3)
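# A hypothetical minimal `Concept` wrapper, consistent with how it is used
# above (a lone special token, and (word, other, vector) triples); the
# project's real class may differ.
class Concept:
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []
        self.vectors = {}  # maps (word, other) pairs to their vectors

    def add_word(self, word, other=None, vec=None):
        # Register the word itself the first time it is seen.
        if word not in self.word2idx:
            self.word2idx[word] = len(self.idx2word)
            self.idx2word.append(word)
        # Optionally store the pairwise vector for (word, other).
        if other is not None:
            self.vectors[(word, other)] = vec

    def __len__(self):
        return len(self.idx2word)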
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from torchnlp.datasets import snli_dataset
from torchtext import data
from torchtext import datasets

# Map the SNLI labels to integer ids and back ('-' marks unlabelled pairs).
word_to_ix = {'entailment': 0, 'neutral': 1, 'contradiction': 2, '-': 3}
ix_to_word = {0: 'entailment', 1: 'neutral', 2: 'contradiction', 3: '-'}


def map_to_ix(x):
    return word_to_ix[x]


def map_to_word(x):
    return ix_to_word[x]


# Data preprocessing: load each split into a DataFrame and encode its labels.
train = pd.DataFrame(snli_dataset(train=True),
                     columns=['premise', 'hypothesis', 'label'])
train['label'] = train['label'].apply(map_to_ix)

val = pd.DataFrame(snli_dataset(dev=True),
                   columns=['premise', 'hypothesis', 'label'])
val['label'] = val['label'].apply(map_to_ix)

test = pd.DataFrame(snli_dataset(test=True),
                    columns=['premise', 'hypothesis', 'label'])
test['label'] = test['label'].apply(map_to_ix)

# TF-IDF logistic-regression classifier.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(
        penalty='l2', multi_class='auto', solver='saga',
        max_iter=100, tol=1e-3)),
])
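# A minimal sketch of training and evaluating the pipeline above. The
# sklearn vectorizer expects one text column, so premise and hypothesis
# are joined with a separator here; the ' ||| ' separator and accuracy
# as the metric are illustrative choices, not fixed by the source.
train_text = train['premise'] + ' ||| ' + train['hypothesis']
val_text = val['premise'] + ' ||| ' + val['hypothesis']

text_clf.fit(train_text, train['label'])
val_pred = text_clf.predict(val_text)
print('dev accuracy:', (val_pred == val['label']).mean())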