def _create_namedfields(self):
    DE = NamedField(names=('srcSeqlen', ), tokenize=self.tokenize_de)
    EN = NamedField(names=('trgSeqlen', ), tokenize=self.tokenize_en,
                    init_token=self.BOS_WORD, eos_token=self.EOS_WORD)  # only target needs BOS/EOS
    return DE, EN
def load_csv_data(csv_file, device, random_state=1, train_split=0.8, batch_size=10):
    '''
    Load a CSV file of nucleotide sequences.

    Args:
        csv_file: Name of the csv file of nucleotide sequences to model
        random_state: Integer for the random seed of the train/test split
        train_split: Fraction of data used for training (float 0 to 1)
        device: torch device
    Returns:
        train_bucket_iterator, test_bucket_iterator, TEXT
    '''
    # Prepend input with a start token, then split the sequence into codons (3-mers)
    tokenize = lambda x: ["<START>"] + re.findall('.{%d}' % 3, x)
    TEXT = NamedField(names=("seqlen", ), sequential=True, lower=True, tokenize=tokenize)
    my_data = torchtext.data.TabularDataset(csv_file, format="CSV", fields=[("sequence", TEXT)])

    # Randomly seed, then separate into train and test
    random.seed(random_state)
    train, test = my_data.split(split_ratio=train_split, random_state=random.getstate())
    # Remove random seed
    random.seed(None)

    # Build vocab
    TEXT.build_vocab(train)

    # Create bucket iterators
    train_iter_bucket, test_iter_bucket = torchtext.data.BucketIterator.splits(
        (train, test),
        batch_sizes=(batch_size, batch_size),
        sort_within_batch=False,
        sort_key=lambda x: len(x.sequence),
        device=torch.device(device))
    return train_iter_bucket, test_iter_bucket, TEXT
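# Usage sketch (illustrative, not from the original): the file name and batch size
# below are assumptions; the CSV is expected to hold one nucleotide sequence per row
# in a "sequence" column.
train_iter_bucket, test_iter_bucket, TEXT = load_csv_data(
    "sequences.csv", device="cpu", random_state=1, train_split=0.8, batch_size=32)
for batch in train_iter_bucket:
    codons = batch.sequence  # NamedTensor of codon indices with a "seqlen" dimension
    break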
def make_fields(maxlen=-1):
    ENT = NamedField(names=("els",), lower=True, include_lengths=True)
    TYPE = NamedField(names=("els",), lower=True, include_lengths=True)
    VALUE = NamedField(names=("els",), lower=True, include_lengths=True)
    VALUE_TEXT = NamedField(
        names=("els",), lower=True, include_lengths=True,
        init_token=None, eos_token=None, is_target=True)
    TEXT = NamedField(
        names=("time",), lower=True, include_lengths=True,
        init_token="<bos>", eos_token="<eos>", is_target=True,
        fix_length=maxlen if maxlen > 0 else None,
    )
    return ENT, TYPE, VALUE, VALUE_TEXT, TEXT
def build_helper_tables(TEXT, device):
    '''
    Build lookup tables mapping codons to amino acids.

    Args:
        TEXT: torchtext field for the vocab of nucleotides
        device: torch device
    Returns:
        AA_LABEL: torchtext field mapping amino acids to indices
        index_table: lookup table s.t. you can index with a codon index and receive the one hot for its AA
        codon_to_aa: dictionary to move from codon to amino acid string
        codon_to_aa_index: lookup table s.t. you can index with a codon index and receive an AA index
        mask_tbl: index with a codon to get a mask to add to the output of the model
            so that only synonymous options remain
    '''
    AA_LABEL = NamedField(names=("seqlen", ), lower=True)
    bases = "tcag"
    codons = [a + b + c for a in bases for b in bases for c in bases]
    aa = [str(Seq(j).translate()) for j in codons]

    # Mapping of codons to amino acids
    codon_to_aa = dict(zip(codons, aa))

    # One hot encoding of all possible amino acids
    AA_LABEL.build_vocab(aa)

    # Make a lookup table, such that you can index with the vocab item (e.g. a codon)
    # and get the one hot corresponding to its amino acid
    one_hot_vec = torch.eye(len(AA_LABEL.vocab))
    zero_vec = torch.zeros(len(AA_LABEL.vocab), 1)
    direct_look_up = [one_hot_vec[AA_LABEL.vocab.stoi[codon_to_aa[TEXT.vocab.itos[i]]]].unsqueeze(1)
                      if TEXT.vocab.itos[i] in codon_to_aa else zero_vec
                      for i in range(len(TEXT.vocab.stoi))]

    # Shape: codon x one hot
    index_table = torch.cat(direct_look_up, dim=1).t()
    codon_to_aa_index = torch.argmax(index_table, 1)

    # Build masking table.
    # If a codon is a synonymous option, give it 0 value; if not, give -1e9.
    # Add this to the output vector (i.e. output += mask_tbl[trg]) before the softmax.
    mask_tbl = torch.tensor(np.array(
        [[0 if (codon in codon_to_aa and codon_2 in codon_to_aa
                and codon_to_aa[codon] == codon_to_aa[codon_2]) else -1e9
          for codon_2 in TEXT.vocab.itos]
         for index, codon in enumerate(TEXT.vocab.itos)])).to(device)

    # For ease, make sure padding gets predicted as padding
    mask_tbl[1, 1] = 0

    return (AA_LABEL, index_table, codon_to_aa, codon_to_aa_index, mask_tbl)
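# Sketch of how the returned mask_tbl can be applied (illustrative; the batch size
# and the `logits`/`trg` tensors are assumptions, and TEXT must already have a built
# vocab, e.g. from load_csv_data above): adding the mask to the model's scores before
# the softmax leaves probability mass only on codons synonymous with the target codon.
AA_LABEL, index_table, codon_to_aa, codon_to_aa_index, mask_tbl = build_helper_tables(TEXT, "cpu")
logits = torch.randn(4, len(TEXT.vocab))         # (batch, vocab) scores from some model
trg = torch.randint(0, len(TEXT.vocab), (4,))    # target codon indices
masked = logits + mask_tbl[trg].float()          # -1e9 for non-synonymous codons
probs = torch.softmax(masked, dim=-1)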
def make_fields(maxlen=-1, bert=False):
    ENT = NamedField(names=("els",), lower=True, include_lengths=True)
    TYPE = NamedField(names=("els",), lower=True, include_lengths=True)
    VALUE = NamedField(names=("els",), lower=True, include_lengths=True)
    VALUE_TEXT = NamedField(
        names=("els",), lower=True, include_lengths=True,
        init_token=None, eos_token=None, is_target=True)
    TEXT = NamedField(
        tokenize=BertTokenizer.from_pretrained('bert-base-uncased').tokenize if bert else None,
        names=("time",), lower=True, include_lengths=True,
        init_token="<bos>", eos_token="<eos>", is_target=True,
        fix_length=maxlen if maxlen > 0 else None,
    )
    return ENT, TYPE, VALUE, VALUE_TEXT, TEXT
def load_text(path, debug=False, device="cpu"):
    # Our input $x$
    TEXT = NamedField(names=("seqlen", ))

    # Data distributed with the assignment
    train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
        path=path,
        train="train.txt",
        validation="valid.txt",
        test="valid.txt",
        text_field=TEXT,
    )

    # When debugging you may want to use a smaller vocab size. This will run much faster.
    if debug:
        TEXT.build_vocab(train, max_size=1000)
    else:
        TEXT.build_vocab(train)

    train_iter, val_iter, test_iter = NamedBpttIterator.splits(
        (train, val, test), batch_size=10, device=device, bptt_len=32, repeat=False)

    return train_iter, val_iter, test_iter, TEXT
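# Usage sketch (assumptions: train.txt/valid.txt live in the given directory and
# NamedBpttIterator is defined elsewhere, e.g. as in the BPTT fragment further down).
train_iter, val_iter, test_iter, TEXT = load_text(".", debug=True, device="cpu")
for batch in train_iter:
    x, y = batch.text, batch.target   # target is the text shifted one step ahead
    break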
def load(device='cpu', pretrained_embedding='glove.6B.300d', embedding_dim=300,
         embedding_num=100, batch_size=16):
    # Our input $x$
    TEXT = NamedField(names=('seqlen',))

    # Our labels $y$
    LABEL = NamedField(sequential=False, names=())

    # Create train/val/test split
    train, val, test = torchtext.datasets.SNLI.splits(TEXT, LABEL)

    # Build vocabs
    TEXT.build_vocab(train)
    LABEL.build_vocab(train)

    # Create iterators
    train_iter, val_iter = torchtext.data.BucketIterator.splits(
        (train, val), batch_size=batch_size, device=torch.device(device), repeat=False)
    test_iter = torchtext.data.BucketIterator(test, train=False, batch_size=10,
                                              device=torch.device(device))

    # Build the vocabulary with word embeddings.
    # Out-of-vocabulary (OOV) words are hashed to one of 100 random embeddings, each
    # initialized with mean 0 and standard deviation 1 (Sec 5.1)
    unk_vectors = [torch.randn(embedding_dim) for _ in range(embedding_num)]
    TEXT.vocab.load_vectors(vectors=pretrained_embedding,
                            unk_init=lambda x: random.choice(unk_vectors))

    # Normalize embeddings to have l_2 norm of 1
    vectors = TEXT.vocab.vectors
    vectors = vectors / vectors.norm(dim=1, keepdim=True)
    vectors = NamedTensor(vectors, ('word', 'embedding'))
    TEXT.vocab.vectors = vectors

    return train_iter, val_iter, test_iter, TEXT, LABEL
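# Usage sketch (illustrative): the first call downloads SNLI and the GloVe vectors,
# so the device and batch size here are just assumptions for a quick test.
train_iter, val_iter, test_iter, TEXT, LABEL = load(device='cpu', batch_size=16)
print(len(TEXT.vocab), "tokens,", len(LABEL.vocab), "labels")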
        for i in range(0, len(self) * self.bptt_len, self.bptt_len):
            self.iterations += 1
            seq_len = min(self.bptt_len, len(data) - i - 1)
            yield Batch.fromvars(
                dataset, self.batch_size,
                text=data.narrow("seqlen", i, seq_len),
                target=data.narrow("seqlen", i + 1, seq_len),
            )
        if not self.repeat:
            return


# Our input $x$
TEXT = NamedField(names=("seqlen", ))
# TEXT = torchtext.data.Field()  # PyTorch alternative

GLOVE = False       # Use GloVe embeddings
DEBUG_MODE = False  # When debugging, use a smaller vocab size

# Data distributed with the assignment
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path=".", train="train.txt", validation="valid.txt", test="valid.txt",
    text_field=TEXT)

if GLOVE:
    TEXT.build_vocab(train, vectors='glove.6B.100d')
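# Illustrative note (not from the source): each yielded Batch pairs a window of text
# with the same window shifted one token ahead, e.g. for tokens [a, b, c, d] and
# bptt_len = 3:
#   text   = [a, b, c]
#   target = [b, c, d]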
import torch
from torch import optim

# Text processing library and methods for pretrained word embeddings
import torchtext
from torchtext.vocab import Vectors, GloVe

# Named Tensor wrappers
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField

device = torch.device("cpu")

# Our input $x$
TEXT = NamedField(names=('seqlen', ))

# Our labels $y$
LABEL = NamedField(sequential=False, names=(), unk_token=None)

train, val, test = torchtext.datasets.SST.splits(
    TEXT, LABEL, filter_pred=lambda ex: ex.label != 'neutral')

train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, val, test), batch_size=10, device=device)

# TEXT.vocab.load_vectors()
TEXT.build_vocab(train, max_size=25000, vectors="glove.6B.100d")
LABEL.build_vocab(train)

# Build the vocabulary with word embeddings
url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
import torch
import torchtext
from torchtext.vocab import Vectors, GloVe
from nltk.util import ngrams
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField

# Fields for processing
def tokenizer(text):
    # Prefix unigrams with "1" and joined bigrams with "2" so they share one vocab
    words = [f.lower() for f in text.split(" ")]
    token = (["1" + i for i in words]
             + ["2" + "".join(i) for i in ngrams(text.split(" "), 2)])
    return token

NGRAMS = NamedField(names=('ngramlen', ), sequential=True, tokenize=tokenizer)
LABEL = NamedField(sequential=False, names=(), unk_token=None, dtype=torch.float)

# Load and split data into training sets
train, val, test = torchtext.datasets.SST.splits(
    NGRAMS, LABEL, filter_pred=lambda ex: ex.label != 'neutral')

# Build vocab
NGRAMS.build_vocab(train, min_freq=2)
LABEL.build_vocab(train)

# Set up batches for model input (batch size and device below are assumptions)
train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, val, test), batch_size=10, device=torch.device('cpu'))
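# Quick check of the tokenizer's prefix scheme (the example sentence is an
# assumption): unigrams are lower-cased and prefixed with "1", bigrams are joined
# as-is and prefixed with "2".
print(tokenizer("A great movie"))
# ['1a', '1great', '1movie', '2Agreat', '2greatmovie']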
import torch
import torchtext
from namedtensor.text import NamedField
import time
import numpy as np
from torchtext.data.iterator import BPTTIterator
from torchtext.data import Batch, Dataset
import math

bptt_len = 10
mode = 'nonstatic'
device = torch.device("cuda")
use_pretrained = True
batch_size = 256

# Our input $x$
TEXT = NamedField(names=("seqlen",))

# Data distributed with the assignment
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path=".", train="train.txt", validation="valid.txt", test="valid.txt",
    text_field=TEXT)

# TEXT.build_vocab(train)
# print('len(TEXT.vocab)', len(TEXT.vocab))

if use_pretrained:
    TEXT.build_vocab(train, vectors="glove.840B.300d")
    vocab_size, embed_size = TEXT.vocab.vectors.size()
else:
import torch
import torchtext
from torchtext.vocab import Vectors, GloVe
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

# Fields for processing
TEXT = NamedField(names=('seqlen', ))
LABEL = NamedField(sequential=False, names=(), unk_token=None)

# Split data into train, validation, test
train, val, test = torchtext.datasets.SST.splits(
    TEXT, LABEL, filter_pred=lambda ex: ex.label != 'neutral')

# Build vocab
TEXT.build_vocab(train, vectors='glove.6B.100d')
LABEL.build_vocab(train)

# Set up batches for model input
train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, val, test), batch_size=128, device=torch.device('cuda'))


class CNN(nn.Module):
    def __init__(
import argparse
import math
import torch
import torchtext
from torchtext.vocab import Vectors
from torchtext.data.iterator import BPTTIterator
from namedtensor.text import NamedField
import models.models as models

# parser
parser = argparse.ArgumentParser(description='CS_6741_HW_2')
parser.add_argument('--model_type', default='', type=str, help='tg | nn | lstm')
parser.add_argument('--pretrained', default='', type=str, help='model path')
parser.add_argument('--em', action='store_true')
parser.add_argument('--analysis', action='store_true')
parser.add_argument('--use_word_vec', action='store_true')

TEXT = NamedField(names=("seqlen", ))
train_txt, val_txt, test_txt = torchtext.datasets.LanguageModelingDataset.splits(
    path=".", train="train.txt", validation="valid.txt", test="test.txt",
    text_field=TEXT)
TEXT.build_vocab(train_txt)

url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.vec'
word_vec = TEXT.vocab.load_vectors(vectors=Vectors('wiki.simple.vec', url=url))


class LMDataset(BPTTIterator):
    def __iter__(self):
import torch
import torchtext
from torchtext.data.iterator import BPTTIterator
from torchtext.data import Batch, Dataset

# Named Tensor wrappers
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField

# Setting the default tensor type to `torch.cuda.FloatTensor`;
# change this to `torch.FloatTensor` if you don't have a gpu
# torch.set_default_tensor_type(torch.FloatTensor)
torch.set_default_tensor_type(torch.FloatTensor)

DEBUG_MODE = False

# Our input $x$
TEXT = NamedField(names=('seqlen', ))

# Data distributed with the assignment
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path='.', train='data/train.txt', validation='data/valid.txt',
    test='data/test.txt', text_field=TEXT)

# Use a smaller vocab size when debugging
if not DEBUG_MODE:
    TEXT.build_vocab(train)
else:
    TEXT.build_vocab(train, max_size=1000)
import torch
from torch import optim

# Text processing library and methods for pretrained word embeddings
import torchtext
from torchtext.vocab import Vectors, GloVe

# Named Tensor wrappers
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField

# Our input $x$
TEXT = NamedField(names=('seqlen', ))

# Our labels $y$
LABEL = NamedField(sequential=False, names=(), unk_token=None)

train, val, test = torchtext.datasets.SST.splits(
    TEXT, LABEL, filter_pred=lambda ex: ex.label != 'neutral')

TEXT.build_vocab(train)
LABEL.build_vocab(train)

device = torch.device("cpu")
train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, val, test), batch_size=10, device=device)

# Build the vocabulary with word embeddings
url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
TEXT.vocab.load_vectors(vectors=Vectors('wiki.simple.vec', url=url))
import spacy
from torchtext import datasets
from namedtensor.text import NamedField

spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

BOS_WORD = '<s>'
EOS_WORD = '</s>'
DE = NamedField(names=('srcSeqlen', ), tokenize=tokenize_de)
EN = NamedField(names=('trgSeqlen', ), tokenize=tokenize_en,
                init_token=BOS_WORD, eos_token=EOS_WORD)  # only target needs BOS/EOS
MAX_LEN = 20

import dill
import pickle

try:
    train, val = pickle.load(open("saved_data.p", 'rb'))
    print("loaded")
except:
    print("could not load:")
    train, val, test = datasets.IWSLT.splits(
        exts=('.de', '.en'),
def main(args):
    # Our input $x$
    TEXT = NamedField(names=('seqlen', ))

    # Our labels $y$
    LABEL = NamedField(sequential=False, names=(), unk_token=None)

    train, val, test = torchtext.datasets.SST.splits(
        TEXT, LABEL,
        filter_pred=lambda ex: ex.label != 'neutral',
        train_subtrees=args.train_subtrees)

    TEXT.build_vocab(train)
    LABEL.build_vocab(train)
    vocab_size = len(TEXT.vocab.itos)
    num_classes = len(LABEL.vocab.itos)
    padding_idx = TEXT.vocab.stoi['<pad>']

    device = torch.device('cuda:%d' % args.gpu) if args.gpu > -1 else torch.device('cpu')
    train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
        (train, val, test), batch_size=10, device=device, repeat=False)
    # Rebuild the training iterator with the requested batch size
    train_iter = torchtext.data.BucketIterator(train, batch_size=args.bsz, device=device,
                                               repeat=False, train=True)

    if args.model != 'NB':
        # Build the vocabulary with word embeddings
        url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
        print('loading word vectors from %s' % url)
        if not args.big_vec:
            TEXT.vocab.load_vectors(vectors=Vectors('wiki.simple.vec', url=url))
        else:
            TEXT.vocab.load_vectors(vectors=GloVe(name="840B"))

    # Build model
    print('Building model %s' % args.model)
    models = [NB, LR, CBoW, CNN]
    Model = list(filter(lambda x: x.__name__ == args.model, models))[0]
    model = Model(TEXT.vocab, num_classes, padding_idx)
    if args.gpu > -1:
        model.cuda(args.gpu)

    if args.model == 'NB':
        print('Counting frequencies')
        train_NB(model, train_iter)
        print('Validating')
        correct, total, accuracy = validate(model, val_iter)
        print('Validation Accuracy: %f' % (accuracy))
    else:
        params = [p for p in model.parameters() if p.requires_grad]
        optimizer = (torch.optim.SGD(params, lr=args.lr) if args.optim == "sgd"
                     else torch.optim.Adam(params, lr=args.lr))
        state = train_model(model, train_iter, val_iter, optimizer, args.epochs)
        # Load best params based on val acc
        model.load_state_dict(state)
        print('Testing')
        correct, total, accuracy = validate(model, test_iter)
        print('Test Accuracy: %f' % (accuracy))

    if args.test_code:
        print('Writing predictions to predictions.txt')
        test_code(model, test)