Example #1
def _create_namedfields(self):
    DE = NamedField(names=('srcSeqlen', ), tokenize=self.tokenize_de)
    EN = NamedField(names=('trgSeqlen', ),
                    tokenize=self.tokenize_en,
                    init_token=self.BOS_WORD,
                    eos_token=self.EOS_WORD)  # only target needs BOS/EOS
    return DE, EN
Example #2
import re
import random

import torch
import torchtext
from namedtensor.text import NamedField


def load_csv_data(csv_file, device, random_state=1, train_split=0.8, batch_size=10):
	'''Load a CSV file of nucleotide sequences.
	Args:
		csv_file: Name of the CSV file of nucleotide sequences to model
		device: torch device
		random_state: Integer seed for the random train/test split
		train_split: Fraction of the data used for training (float, 0 to 1)
		batch_size: Number of sequences per bucket-iterator batch

	Returns:
		train_bucket_iterator, test_bucket_iterator, TEXT
	'''
	# Tokenize into codons (non-overlapping 3-character chunks) and prepend a start token
	tokenize = lambda x: ["<START>"] + re.findall('.{3}', x)
	TEXT = NamedField(names=("seqlen", ), sequential=True,
						lower=True, tokenize=tokenize)

	my_data = torchtext.data.TabularDataset(csv_file, format="CSV", 
											fields=[("sequence", TEXT)])
	# Randomly seed then separate train test
	random.seed(random_state)
	train, test = my_data.split(split_ratio=train_split, random_state=random.getstate())
	# Remove random seed
	random.seed(None)
	# Build vocab
	TEXT.build_vocab(train)

	# Create bucket iterators
	train_iter_bucket, test_iter_bucket = torchtext.data.BucketIterator.splits(
		(train, test), batch_sizes=(batch_size, batch_size),
		sort_within_batch=False, sort_key=lambda x: len(x.sequence),
		device=torch.device(device))

	return train_iter_bucket, test_iter_bucket, TEXT
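A minimal usage sketch for the loader above; the file name "sequences.csv", the device string, and the batch size are assumptions, not part of the original code:

import torch

# Hypothetical usage of load_csv_data; the CSV path is an assumption
device = "cuda" if torch.cuda.is_available() else "cpu"
train_iter, test_iter, TEXT = load_csv_data("sequences.csv", device, batch_size=32)
for batch in train_iter:
    # batch.sequence is a NamedTensor named along "seqlen" and "batch"
    print(batch.sequence.shape)
    break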
Example #3
def make_fields(maxlen=-1):
    ENT   = NamedField(names=("els",), lower=True, include_lengths=True)
    TYPE  = NamedField(names=("els",), lower=True, include_lengths=True)
    VALUE = NamedField(names=("els",), lower=True, include_lengths=True)
    VALUE_TEXT = NamedField(
        names = ("els",),
        lower=True, include_lengths=True, init_token=None, eos_token=None, is_target=True)
    TEXT  = NamedField(
        names = ("time",),
        lower=True, include_lengths=True,
        init_token="<bos>", eos_token="<eos>", is_target=True,
        fix_length = maxlen if maxlen > 0 else None,
    )
    return ENT, TYPE, VALUE, VALUE_TEXT, TEXT
Example #4
import numpy as np
import torch
from Bio.Seq import Seq
from namedtensor.text import NamedField


def build_helper_tables(TEXT, device):
	'''Build codon/amino-acid helper tables from the nucleotide vocab.
	Args:
		TEXT: torchtext field for the vocab of nucleotides (codons)
		device: torch device

	Returns:
		AA_LABEL: torchtext field mapping amino acids to indices
		index_table: lookup table s.t. you can index with a codon index and receive the one hot for its amino acid
		codon_to_aa: dictionary mapping a codon string to its amino acid string
		codon_to_aa_index: lookup table s.t. you can index with a codon index and receive the amino acid index
		mask_tbl: index with a codon to get a mask row; add it to the model output to keep only synonymous options
	'''

	AA_LABEL = NamedField(names=("seqlen", ), 
						lower=True)
	bases = "tcag"
	codons = [a + b + c for a in bases for b in bases for c in bases]
	aa = [str(Seq(j).translate()) for j in codons]
	# Mapping of codons to amino acids
	codon_to_aa = dict(zip(codons, aa))
	# One hot encoding of all possible amino acids
	AA_LABEL.build_vocab(aa)

	# Make a look up table, such that you can index with the vocab item (e.g. a codon)
	# and get the one hot corresponding to its amino acid
	one_hot_vec = torch.eye(len(AA_LABEL.vocab))
	zero_vec =  torch.zeros(len(AA_LABEL.vocab), 1)
	# For each vocab entry that is a codon, grab the one hot of its amino acid;
	# non-codon entries (e.g. <unk>, <pad>, <START>) get a zero vector
	direct_look_up = [one_hot_vec[AA_LABEL.vocab.stoi[codon_to_aa[TEXT.vocab.itos[i]]]].unsqueeze(1)
						if TEXT.vocab.itos[i] in codon_to_aa else zero_vec
						for i in range(len(TEXT.vocab.stoi))]

	# Shape codon x one hot 
	index_table = torch.cat(direct_look_up, dim=1).t()
	codon_to_aa_index = torch.argmax(index_table, 1)

	# Build masking table
	# Here, if it's a synonymous option, give it 0 value, if not, give -1e9
	# Add this with the output vector (i.e. output += mask_tbl[trg]) before softmax
	mask_tbl = torch.tensor(np.array([[0 if (codon in codon_to_aa and codon_2 in codon_to_aa and codon_to_aa[codon] == codon_to_aa[codon_2]) else -1e9 
				 for codon_2 in TEXT.vocab.itos] 
				for index, codon in enumerate(TEXT.vocab.itos)])).to(device)

	# Padding (vocab index 1) should always be allowed to predict padding
	mask_tbl[1, 1] = 0

	return (AA_LABEL, index_table, codon_to_aa, codon_to_aa_index, mask_tbl)
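A short sketch of applying the masking table at prediction time, following the `output += mask_tbl[trg]` recipe in the comment above; the batch size and the random logits are placeholders:

# Hypothetical decode-time masking with the tables built above (device assumed to be "cpu")
vocab_size = mask_tbl.size(0)
trg = torch.randint(0, vocab_size, (4,))               # dummy reference codon indices (batch of 4)
model_logits = torch.randn(4, vocab_size)              # dummy decoder scores over the codon vocab
masked_logits = model_logits + mask_tbl[trg].float()   # -1e9 entries suppress non-synonymous codons
probs = torch.softmax(masked_logits, dim=-1)           # probability mass only on synonymous options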
Example #5
def make_fields(maxlen=-1, bert=False):
    ENT   = NamedField(names=("els",), lower=True, include_lengths=True)
    TYPE  = NamedField(names=("els",), lower=True, include_lengths=True)
    VALUE = NamedField(names=("els",), lower=True, include_lengths=True)
    VALUE_TEXT = NamedField(
        names = ("els",),
        lower=True, include_lengths=True, init_token=None, eos_token=None, is_target=True)
    TEXT  = NamedField(
        tokenize = BertTokenizer.from_pretrained('bert-base-uncased').tokenize
            if bert else None,
        names = ("time",),
        lower=True, include_lengths=True,
        init_token="<bos>", eos_token="<eos>", is_target=True,
        fix_length = maxlen if maxlen > 0 else None,
    )
    return ENT, TYPE, VALUE, VALUE_TEXT, TEXT
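A hedged usage sketch for the field constructor above; `maxlen=100` and the sample sentence are assumptions, and the `bert=True` branch relies on the same `BertTokenizer` import as the snippet:

# Hypothetical usage: wordpiece tokenization for the text stream, defaults for the record fields
ENT, TYPE, VALUE, VALUE_TEXT, TEXT = make_fields(maxlen=100, bert=True)
tokens = TEXT.preprocess("Named tensors keep dimension names explicit.")  # lowercased wordpieces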
Example #6
def load_text(path, debug=False, device="cpu"):
    # Our input $x$
    TEXT = NamedField(names=("seqlen", ))

    # Data distributed with the assignment
    train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
        path=path,
        train="train.txt",
        validation="valid.txt",
        test="valid.txt",
        text_field=TEXT,
    )
    # When debugging you may want to use a smaller vocab size. This will run much faster.

    if debug:
        TEXT.build_vocab(train, max_size=1000)
        print('len(TEXT.vocab)', len(TEXT.vocab))
    else:
        TEXT.build_vocab(train)

    train_iter, val_iter, test_iter = NamedBpttIterator.splits(
        (train, val, test),
        batch_size=10,
        device=device,
        bptt_len=32,
        repeat=False)

    return train_iter, val_iter, test_iter, TEXT
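A minimal usage sketch for load_text; the path, debug flag, and device are assumptions:

# Hypothetical usage; every BPTT batch pairs a window of text with its one-step-shifted targets
train_iter, val_iter, test_iter, TEXT = load_text(path=".", debug=True, device="cpu")
for batch in train_iter:
    text, target = batch.text, batch.target   # NamedTensors sharing the "seqlen" and "batch" dims
    break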
Example #7
def load(device = 'cpu',
              pretrained_embedding = 'glove.6B.300d',
              embedding_dim = 300,
              embedding_num = 100,
              batch_size = 16):
    # Our input $x$
    TEXT = NamedField(names=('seqlen',))

    # Our labels $y$
    LABEL = NamedField(sequential=False, names=())

    # create train val test split
    train, val, test = torchtext.datasets.SNLI.splits(TEXT, LABEL)

    # build vocabs
    TEXT.build_vocab(train)
    LABEL.build_vocab(train)

    # create iters
    train_iter, val_iter = torchtext.data.BucketIterator.splits(
        (train, val), batch_size=batch_size, device=torch.device(device), repeat=False)

    test_iter = torchtext.data.BucketIterator(test, train=False, batch_size=10, device=torch.device(device))


    # Build the vocabulary with word embeddings.
    # Out-of-vocabulary (OOV) words are hashed to one of `embedding_num` random embeddings,
    # each initialized with mean 0 and standard deviation 1 (Sec 5.1)
    unk_vectors = [torch.randn(embedding_dim) for _ in range(embedding_num)]
    TEXT.vocab.load_vectors(vectors=pretrained_embedding, unk_init=lambda x: random.choice(unk_vectors))

    # normalized to have l_2 norm of 1
    vectors = TEXT.vocab.vectors
    vectors = vectors / vectors.norm(dim=1,keepdim=True)
    vectors = NamedTensor(vectors, ('word', 'embedding'))
    TEXT.vocab.vectors = vectors

    return train_iter, val_iter, test_iter, TEXT, LABEL
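A hedged sketch of calling the SNLI loader above; the device and batch size are assumptions, and the first call will download SNLI and the GloVe vectors:

# Hypothetical usage of load()
train_iter, val_iter, test_iter, TEXT, LABEL = load(device="cpu", batch_size=16)
print(len(TEXT.vocab), "text tokens;", len(LABEL.vocab), "labels")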
Example #8
            # Step through the corpus in windows of bptt_len; the target is the same
            # window shifted one step along "seqlen" (next-token prediction).
            for i in range(0, len(self) * self.bptt_len, self.bptt_len):
                self.iterations += 1
                seq_len = min(self.bptt_len, len(data) - i - 1)
                yield Batch.fromvars(
                    dataset,
                    self.batch_size,
                    text=data.narrow("seqlen", i, seq_len),
                    target=data.narrow("seqlen", i + 1, seq_len),
                )

            if not self.repeat:
                return


# Our input $x$
TEXT = NamedField(names=("seqlen", ))
# TEXT = torchtext.data.Field()  # PyTorch alt

GLOVE = False  # Use GloVe embeddings
DEBUG_MODE = False  # When debugging use smaller vocab size

# Data distributed with the assignment
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path=".",
    train="train.txt",
    validation="valid.txt",
    test="valid.txt",
    text_field=TEXT)

if GLOVE:
    TEXT.build_vocab(train, vectors='glove.6B.100d')
Example #9
import torch
from torch import optim
# Text processing library and methods for pretrained word embeddings
import torchtext
from torchtext.vocab import Vectors, GloVe

# Named Tensor wrappers
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField

device = torch.device("cpu")

# Our input $x$
TEXT = NamedField(names=('seqlen', ))

# Our labels $y$
LABEL = NamedField(sequential=False, names=(), unk_token=None)

train, val, test = torchtext.datasets.SST.splits(
    TEXT, LABEL, filter_pred=lambda ex: ex.label != 'neutral')

train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, val, test), batch_size=10, device=device)

# TEXT.vocab.load_vectors()
TEXT.build_vocab(train, max_size=25000, vectors="glove.6B.100d")
LABEL.build_vocab(train)

# Build the vocabulary with word embeddings
url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
Example #10
import torch
import torchtext
from torchtext.vocab import Vectors, GloVe
from nltk.util import ngrams
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField


# Fields for processing
def tokenizer(text):
    # Prefix unigrams with "1" and joined bigrams with "2" so they share one vocab
    words = [w.lower() for w in text.split(" ")]
    token = (["1" + w for w in words]
             + ["2" + "".join(gram) for gram in ngrams(text.split(" "), 2)])
    return token


NGRAMS = NamedField(names=('ngramlen', ), sequential=True, tokenize=tokenizer)
LABEL = NamedField(sequential=False,
                   names=(),
                   unk_token=None,
                   dtype=torch.float)

# Load and split data into training sets
train, val, test = torchtext.datasets.SST.splits(
    NGRAMS, LABEL, filter_pred=lambda ex: ex.label != 'neutral')

# Build vocab
NGRAMS.build_vocab(train, min_freq=2)
LABEL.build_vocab(train)

# Set up batches for model input
train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
Example #11
import torch
import torchtext
from namedtensor.text import NamedField

import time
import numpy as np

from torchtext.data.iterator import BPTTIterator
from torchtext.data import Batch, Dataset
import math

bptt_len = 10
mode = 'nonstatic'
device = torch.device("cuda")
use_pretrained = True
batch_size = 256
# Our input $x$
TEXT = NamedField(names=("seqlen",))


# Data distributed with the assignment
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path=".",
    train="train.txt", validation="valid.txt", test="valid.txt", text_field=TEXT)

# TEXT.build_vocab(train)
# print('len(TEXT.vocab)', len(TEXT.vocab))

if use_pretrained:
    TEXT.build_vocab(train, vectors="glove.840B.300d")
    vocab_size, embed_size = TEXT.vocab.vectors.size()

else:
Example #12
import torch
import torchtext
from torchtext.vocab import Vectors, GloVe

from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField

import torch.optim as optim

import torch.nn as nn
import torch.nn.functional as F

# Fields for processing
TEXT = NamedField(names=('seqlen', ))
LABEL = NamedField(sequential=False, names=(), unk_token=None)

# Split data into train, validation, test
train, val, test = torchtext.datasets.SST.splits(
    TEXT, LABEL, filter_pred=lambda ex: ex.label != 'neutral')

# Build vocab
TEXT.build_vocab(train, vectors='glove.6B.100d')
LABEL.build_vocab(train)

# Set up batches for model input
train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, val, test), batch_size=128, device=torch.device('cuda'))


class CNN(nn.Module):
    def __init__(
Example #13
import math
import argparse

import torchtext
from torchtext.vocab import Vectors
from torchtext.data.iterator import BPTTIterator
from namedtensor.text import NamedField

import models.models as models

# parser
parser = argparse.ArgumentParser(description='CS_6741_HW_2')
parser.add_argument('--model_type',
                    default='',
                    type=str,
                    help='tg | nn | lstm')
parser.add_argument('--pretrained', default='', type=str, help='model path')
parser.add_argument('--em', action='store_true')
parser.add_argument('--analysis', action='store_true')
parser.add_argument('--use_word_vec', action='store_true')

TEXT = NamedField(names=("seqlen", ))

train_txt, val_txt, test_txt = torchtext.datasets.LanguageModelingDataset.splits(
    path=".",
    train="train.txt",
    validation="valid.txt",
    test="test.txt",
    text_field=TEXT)

TEXT.build_vocab(train_txt)
url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.vec'
word_vec = TEXT.vocab.load_vectors(vectors=Vectors('wiki.simple.vec', url=url))


class LMDataset(BPTTIterator):
    def __iter__(self):
Example #14
import torch
import torchtext
from torchtext.data.iterator import BPTTIterator
from torchtext.data import Batch, Dataset

# Named Tensor wrappers
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField

# Set the default tensor type: use `torch.cuda.FloatTensor` if you have a GPU,
# otherwise keep `torch.FloatTensor`.
# torch.set_default_tensor_type(torch.cuda.FloatTensor)
torch.set_default_tensor_type(torch.FloatTensor)

DEBUG_MODE = False

# Our input $x$
TEXT = NamedField(names=('seqlen', ))

# Data distributed with the assignment
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path='.',
    train='data/train.txt',
    validation='data/valid.txt',
    test='data/test.txt',
    text_field=TEXT)

# use a smaller vocab size when debugging
if not DEBUG_MODE:
    TEXT.build_vocab(train)
else:
    TEXT.build_vocab(train, max_size=1000)
Example #15
import torch
from torch import optim
# Text processing library and methods for pretrained word embeddings
import torchtext
from torchtext.vocab import Vectors, GloVe

# Named Tensor wrappers
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField

# Our input $x$
TEXT = NamedField(names=('seqlen', ))

# Our labels $y$
LABEL = NamedField(sequential=False, names=(), unk_token=None)

train, val, test = torchtext.datasets.SST.splits(
    TEXT, LABEL, filter_pred=lambda ex: ex.label != 'neutral')

TEXT.build_vocab(train)
LABEL.build_vocab(train)

device = torch.device("cpu")

train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, val, test), batch_size=10, device=device)

# Build the vocabulary with word embeddings
url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
TEXT.vocab.load_vectors(vectors=Vectors('wiki.simple.vec', url=url))
Example #16
import spacy
from torchtext import datasets
from namedtensor.text import NamedField

spacy_de = spacy.load('de')
spacy_en = spacy.load('en')


def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]


def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]


BOS_WORD = '<s>'
EOS_WORD = '</s>'
DE = NamedField(names=('srcSeqlen', ), tokenize=tokenize_de)
EN = NamedField(names=('trgSeqlen', ),
                tokenize=tokenize_en,
                init_token=BOS_WORD,
                eos_token=EOS_WORD)  # only target needs BOS/EOS

MAX_LEN = 20
import dill
import pickle
try:
    train, val = pickle.load(open("saved_data.p", 'rb'))
    print("loaded cached train/val splits")
except:
    print("could not load cached splits; rebuilding")
    train, val, test = datasets.IWSLT.splits(
        exts=('.de', '.en'),
Example #17
def main(args):
    # Our input $x$
    TEXT = NamedField(names=('seqlen', ))

    # Our labels $y$
    LABEL = NamedField(sequential=False, names=(), unk_token=None)

    train, val, test = torchtext.datasets.SST.splits(
        TEXT,
        LABEL,
        filter_pred=lambda ex: ex.label != 'neutral',
        train_subtrees=args.train_subtrees)

    TEXT.build_vocab(train)
    LABEL.build_vocab(train)
    vocab_size = len(TEXT.vocab.itos)
    num_classes = len(LABEL.vocab.itos)
    padding_idx = TEXT.vocab.stoi['<pad>']

    device = (torch.device('cuda:%d' % args.gpu)
              if args.gpu > -1 else torch.device('cpu'))
    train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
        (train, val, test), batch_size=10, device=device, repeat=False)
    # Rebuild the train iterator so the training batch size follows args.bsz
    train_iter = torchtext.data.BucketIterator(train,
                                               batch_size=args.bsz,
                                               device=device,
                                               repeat=False,
                                               train=True)

    if args.model != 'NB':
        # Build the vocabulary with word embeddings
        url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
        print('loading word vectors from %s' % url)
        if not args.big_vec:
            TEXT.vocab.load_vectors(
                vectors=Vectors('wiki.simple.vec', url=url))
        else:
            TEXT.vocab.load_vectors(vectors=GloVe(name="840B"))

    # Build model
    print('Building model %s' % args.model)
    models = [NB, LR, CBoW, CNN]
    Model = list(filter(lambda x: x.__name__ == args.model, models))[0]
    model = Model(TEXT.vocab, num_classes, padding_idx)
    if args.gpu > -1:
        model.cuda(args.gpu)

    if args.model == 'NB':
        print('Counting frequencies')
        train_NB(model, train_iter)
        print('Validating')
        correct, total, accuracy = validate(model, val_iter)
        print('Validation Accuracy: %f' % (accuracy))
    else:
        params = [p for p in model.parameters() if p.requires_grad]
        optimizer = (torch.optim.SGD(params, lr=args.lr) if args.optim == "sgd"
                     else torch.optim.Adam(params, lr=args.lr))
        state = train_model(model, train_iter, val_iter, optimizer,
                            args.epochs)
        # Load best params based on val acc
        model.load_state_dict(state)

    print('Testing')
    correct, total, accuracy = validate(model, test_iter)
    print('Test Accuracy: %f' % (accuracy))

    if args.test_code:
        print('Writing predictions to predictions.txt')
        test_code(model, test)
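The excerpt ends inside main; a hedged sketch of a matching entry point, where the argparse parser (defined elsewhere in the script, as in Example #13) is assumed to provide the flags used above:

# Hypothetical entry point; assumes a module-level `parser` exposing --model, --bsz, --lr,
# --optim, --epochs, --gpu, --train_subtrees, --big_vec and --test_code
if __name__ == '__main__':
    args = parser.parse_args()
    main(args)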