from torchtext.vocab import CharNGram, FastText, GloVe  # FastTextSubwordCC / FastTextCC are project-specific wrappers

def get_all_vectors(pretrained_model):
    """Parse a comma-separated spec string (e.g. "glove_840B_300,char_ngram")
    into a list of pretrained vector objects."""
    emb_vectors = []

    if pretrained_model == "":
        return emb_vectors

    emb_vector_names = pretrained_model.split(",")
    for emb_vector_name in emb_vector_names:
        emb_info = emb_vector_name.split("_")
        if len(emb_info) == 3:
            emb_name, emb_set, emb_size = emb_info[0], emb_info[1], emb_info[2]
        else:
            emb_name, emb_set = emb_info[0], emb_info[1]

        if emb_name == "glove":  # e.g. glove_840B_300
            print("glove")
            emb_vectors.append(GloVe(name=emb_set, dim=emb_size))
        elif emb_name == "fasttext":
            # For fasttext specs the third field is a language code, not a dimension.
            if emb_set == "subwordcc":  # fasttext_subwordcc
                print("fasttext_subwordcc")
                emb_vectors.append(FastTextSubwordCC())
            elif emb_set == "wiki":  # fasttext_wiki_en
                print("fasttext_wiki")
                emb_vectors.append(FastText(language=emb_size))
            elif emb_set == "cc":  # fasttext_cc_en
                print("fasttext_cc")
                emb_vectors.append(FastTextCC(language=emb_size))
        elif emb_name == "char":  # char_ngram
            if emb_set == "ngram":
                print("char_ngram")
                emb_vectors.append(CharNGram())

    return emb_vectors
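# Usage sketch for get_all_vectors (an assumption, not part of the original):
# the returned list can be handed to a legacy-torchtext Vocab, which
# concatenates the sources per token. "glove_6B_100,char_ngram" is a
# hypothetical spec string; both sources are downloaded on first use.
from collections import Counter
from torchtext.vocab import Vocab  # torchtext <= 0.8, or torchtext.legacy.vocab in 0.9-0.11

counter = Counter("the quick brown fox jumps over the lazy dog".split())
vecs = get_all_vectors("glove_6B_100,char_ngram")   # -> [GloVe('6B', 100d), CharNGram()]
vocab_obj = Vocab(counter, vectors=vecs)
print(vocab_obj.vectors.size())                     # (vocab size, 100 + 100)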
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import CharNGram, FastText, GloVe

# SentimentDataset, SentimentClassificationModel, train, plotting, device and
# batch_size are project-specific and assumed to be defined elsewhere.

def main(params):
    # Build dataset
    train_data = pd.read_csv('./data/train_final.csv')
    tokenizer = get_tokenizer('spacy', language='en')

    if params.emb_type == "GloVe":
        # GloVe embedding; torchtext's default is name='840B', dim=300
        embedding = GloVe(name=params.emb_data, dim=params.emb_dim)
    elif params.emb_type == "CharNGram":
        embedding = CharNGram()
    elif params.emb_type == "FastText":
        # torchtext's FastText takes a language code (e.g. 'en'), not name/dim
        embedding = FastText(language=params.emb_data)
    else:
        print("Wrong embedding type")
        exit()

    # Hold out the first 1000 rows for validation
    train_data, val_data = train_data[1000:], train_data[:1000]
    train_dataset = SentimentDataset(train_data, tokenizer, embedding)
    val_dataset = SentimentDataset(val_data, tokenizer, embedding)
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

    model = SentimentClassificationModel(params.emb_dim, params.hidden_dim, params.dropout).to(device)
    crit = nn.CrossEntropyLoss().to(device)
    optim = torch.optim.Adam(params=model.parameters(), lr=1e-3)

    best_val_acc = 0
    early_stop_cnt = 0
    epoch = 0
    train_loss_list = []
    train_acc_list = []
    val_acc_list = []

    # Early stopping: stop once the counter reaches 5
    while early_stop_cnt != 5:
        loss_list, train_acc = train.trainer(epoch, model, train_dataloader, crit, optim, device)
        val_acc = train.eval(epoch, model, val_dataloader, device, False)

        if val_acc > best_val_acc and epoch > 0:
            torch.save(model.state_dict(), './model/lstm_best.pt')
            best_val_acc = val_acc
            early_stop_cnt = 0

        early_stop_cnt += 1
        epoch += 1

        train_loss_list.extend(loss_list)
        train_acc_list.append(train_acc)
        val_acc_list.append(val_acc)

    print("Early stopping condition satisfied")

    plotting("train_loss", "steps", "loss", train_loss_list)
    plotting("train_accuracy", "epoch", "accuracy", train_acc_list)
    plotting('validation_accuracy', "epoch", "accuracy", val_acc_list)
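# The dataset class above is project-specific; below is a hypothetical sketch
# of what its __getitem__ might do (an assumption, not the original
# implementation). It looks each token up directly in the Vectors object and
# zero-pads to a fixed length. Note that CharNGram's lookup returns a (1, dim)
# tensor and would need an extra squeeze.
import torch
from torch.utils.data import Dataset

class SentimentDatasetSketch(Dataset):
    """Hypothetical stand-in for the project-specific SentimentDataset."""

    def __init__(self, dataframe, tokenizer, embedding, max_len=64):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.embedding = embedding      # e.g. a GloVe or FastText Vectors object
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        tokens = self.tokenizer(str(row['text']))[:self.max_len]  # 'text' column is an assumption
        padded = torch.zeros(self.max_len, self.embedding.dim)    # zero-pad to max_len
        if tokens:
            padded[:len(tokens)] = torch.stack([self.embedding[t] for t in tokens])
        return padded, torch.tensor(int(row['label']))            # 'label' column is an assumption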
def get_vectors(self, vocab):
    sources = None
    if self.wordvec_source == 'glove':
        sources = ['GloVe']
    elif self.wordvec_source == 'charlevel':
        sources = ['GloVe', 'charLevel']
    elif self.wordvec_source == 'google':
        sources = ['googlenews']
    elif self.wordvec_source == 'gigavec':
        sources = ['gigavec']
    else:
        sources = []

    print('Building Vocab...')
    if isinstance(vocab, Vocab):
        print("Using Pretrained Vocab")
        self.sentence_field.vocab = vocab
        print(len(self.sentence_field.vocab.itos))
    else:
        print('wrong')

    vecs = []
    print('Loading Vectors From Memory...')
    if self.pretrained_vecs:
        print('Using these vectors: ' + str(self.wordvec_source))
        for source in sources:
            if source == 'GloVe':
                glove = Vectors(name='glove.6B.{}d.txt'.format(self.glove_dim),
                                cache=self.vector_cache)
                vecs.append(glove)
                self.wordvec_dim += self.glove_dim
            if source == 'charLevel':
                charVec = CharNGram()
                vecs.append(charVec)  # append so the 100 extra dims counted below are actually used
                self.wordvec_dim += 100
            if source == 'googlenews':
                googlenews = Vectors(name='googlenews.txt',
                                     cache=self.vector_cache)
                vecs.append(googlenews)
                self.wordvec_dim += 300
            if source == 'gigavec':
                gigavec = Vectors(name='gigamodel.vec',
                                  cache=self.vector_cache)
                vecs.append(gigavec)
                self.wordvec_dim += 300

    if isinstance(vocab, Counter):
        self.sentence_field.vocab = Vocab(vocab, vectors=vecs,
                                          max_size=self.max_vocab)
    else:
        self.sentence_field.build_vocab(self.train_sentences, vectors=vecs,
                                        max_size=self.max_vocab,
                                        min_freq=MIN_FREQ)
    print('Found {} tokens'.format(len(self.sentence_field.vocab)))

    if self.tie_weights:
        self.hidden_size = self.wordvec_dim
def get_vectors(self):
    # Pre-download the pretrained vectors into the local 'vectors' cache directory.
    if self.glove:
        print('Downloading GloVe Vectors...')
        glove = GloVe(name='6B', cache='vectors')
        print('Done.')
    if self.charngram:
        print('Downloading CharNGram Vectors...')
        charVec = CharNGram(cache='vectors')
        print('Done.')
def test_vocab_download_charngram_vectors(self):
    c = Counter({
        'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2
    })

    # Build a vocab and get vectors twice to test caching, then once more
    # to test string aliases.
    for i in range(3):
        if i == 2:
            vectors = "charngram.100d"
        else:
            vectors = CharNGram()
        v = vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'],
                        vectors=vectors)
        expected_itos = [
            '<unk>', '<pad>', '<bos>', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'
        ]
        expected_stoi = {x: index for index, x in enumerate(expected_itos)}
        self.assertEqual(v.itos, expected_itos)
        self.assertEqual(dict(v.stoi), expected_stoi)
        vectors = v.vectors.numpy()

        # The first 5 entries in each vector.
        expected_charngram = {
            'hello': [
                -0.44782442, -0.08937783, -0.34227219, -0.16233221, -0.39343098
            ],
            'world': [
                -0.29590717, -0.05275926, -0.37334684, 0.27117205, -0.3868292
            ],
        }
        for word in expected_charngram:
            assert_allclose(vectors[v.stoi[word], :5],
                            expected_charngram[word])

        assert_allclose(vectors[v.stoi['<unk>']], np.zeros(100))
        assert_allclose(vectors[v.stoi['OOV token']], np.zeros(100))

    # Delete the vectors after we're done to save disk space on CI
    if os.environ.get("TRAVIS") == "true":
        conditional_remove(
            os.path.join(self.project_root, ".vector_cache", "charNgram.txt"))
        conditional_remove(
            os.path.join(self.project_root, ".vector_cache",
                         "jmt_pre-trained_embeddings.tar.gz"))
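# The string form used on the last iteration above ("charngram.100d") resolves
# through torchtext's registry of pretrained-vector aliases; a quick way to
# inspect the available aliases (legacy torchtext API assumed):
from torchtext.vocab import pretrained_aliases

print(sorted(alias for alias in pretrained_aliases if alias.startswith("charngram")))
# ['charngram.100d']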
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0]))

# build the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(train)

# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

# make iterator for splits
train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=3)

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)

# Approach 2:
TEXT.build_vocab(train, vectors=[GloVe(name='840B', dim='300'), CharNGram()])
LABEL.build_vocab(train)

train_iter, test_iter = datasets.TREC.iters(batch_size=4)

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

# make iterator for splits
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train, val, test), batch_size=3)

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)

# Approach 2:
TEXT.build_vocab(
    train, vectors=[GloVe(name='840B', dim='300'), CharNGram(), FastText()])
LABEL.build_vocab(train)

# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

train_iter, val_iter, test_iter = datasets.SST.iters(batch_size=4)

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)

# Approach 3:
def nyt_ingredients_ner_dataset(batch_size, use_local=False,
                                root='.data/nyt_ingredients_ner',
                                train_file='train.txt',
                                validation_file='valid.txt',
                                test_file='test.txt',
                                convert_digits=True):
    """
    nyt_ingredients_ner: New York Times Ingredient tagging dataset

    Extract NYT ingredients dataset using torchtext. Applies GloVe 6B.200d and
    Char N-gram pretrained vectors. Also sets up per word character Field

    Parameters:
        batch_size: Batch size to return from iterator
        use_local: If True use local provided files (default False)
        root: Dataset root directory
        train_file: Train filename
        validation_file: Validation filename
        test_file: Test filename
        convert_digits: If True will convert numbers to single 0's

    Returns:
        A dict containing:
            task: 'nyt_ingredients.ner'
            iters: (train iter, validation iter, test iter)
            vocabs: (Inputs word vocabulary, Inputs character vocabulary,
                     Tag vocabulary)
    """
    # Setup fields with batch dimension first
    inputs_word = data.Field(
        init_token="<bos>", eos_token="<eos>", batch_first=True, lower=True,
        preprocessing=data.Pipeline(
            lambda w: '0' if convert_digits and w.isdigit() else w))

    inputs_char_nesting = data.Field(tokenize=list, init_token="<bos>",
                                     eos_token="<eos>", batch_first=True)

    inputs_char = data.NestedField(inputs_char_nesting,
                                   init_token="<bos>", eos_token="<eos>")

    labels = data.Field(init_token="<bos>", eos_token="<eos>",
                        batch_first=True)

    fields = [(('inputs_word', 'inputs_char'), (inputs_word, inputs_char)),
              ('labels', labels)]

    # Load the data
    if use_local:
        train, val, test = SequenceTaggingDataset.splits(
            path=root, train=train_file, validation=validation_file,
            test=test_file, fields=tuple(fields))
    else:
        train, val, test = Ingredients.splits(fields=tuple(fields))

    logger.info('---------- NYT INGREDIENTS NER ---------')
    logger.info('Train size: %d' % (len(train)))
    logger.info('Validation size: %d' % (len(val)))
    logger.info('Test size: %d' % (len(test)))

    # Build vocab
    inputs_char.build_vocab(train.inputs_char, val.inputs_char,
                            test.inputs_char)
    inputs_word.build_vocab(train.inputs_word, val.inputs_word,
                            test.inputs_word, max_size=50000,
                            vectors=[GloVe(name='6B', dim='200'), CharNGram()])
    labels.build_vocab(train.labels)

    logger.info('Input vocab size:%d' % (len(inputs_word.vocab)))
    logger.info('Tagset size: %d' % (len(labels.vocab)))

    # Get iterators
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test), batch_size=batch_size,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))
    train_iter.repeat = False

    return {
        'task': 'nyt_ingredients.ner',
        'iters': (train_iter, val_iter, test_iter),
        'vocabs': (inputs_word.vocab, inputs_char.vocab, labels.vocab)
    }
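# A small consumption sketch for the dict returned above (an assumption, not
# part of the original). Legacy torchtext BucketIterator batches expose one
# attribute per field name; running this will download/build the dataset.
nyt = nyt_ingredients_ner_dataset(batch_size=32)
train_iter, val_iter, test_iter = nyt['iters']
word_vocab, char_vocab, tag_vocab = nyt['vocabs']

batch = next(iter(train_iter))
print(batch.inputs_word.size())   # roughly (batch, sentence length)
print(batch.inputs_char.size())   # roughly (batch, sentence length, word length)
print(batch.labels.size())        # roughly (batch, sentence length)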
def conll2003_dataset(tag_type, batch_size, root='./conll2003',
                      train_file='eng.train.txt',
                      validation_file='eng.testa.txt',
                      test_file='eng.testb.txt',
                      convert_digits=True):
    """
    conll2003: Conll 2003 (Parser only. You must place the files)

    Extract Conll2003 dataset using torchtext. Applies GloVe 6B.200d and
    Char N-gram pretrained vectors. Also sets up per word character Field

    Parameters:
        tag_type: Type of tag to pick as task [pos, chunk, ner]
        batch_size: Batch size to return from iterator
        root: Dataset root directory
        train_file: Train filename
        validation_file: Validation filename
        test_file: Test filename
        convert_digits: If True will convert numbers to single 0's

    Returns:
        A dict containing:
            task: 'conll2003.' + tag_type
            iters: (train iter, validation iter, test iter)
            vocabs: (Inputs word vocabulary, Inputs character vocabulary,
                     Tag vocabulary)
    """
    # Setup fields with batch dimension first
    inputs_word = data.Field(
        init_token="<bos>", eos_token="<eos>", batch_first=True, lower=True,
        preprocessing=data.Pipeline(
            lambda w: '0' if convert_digits and w.isdigit() else w))

    inputs_char_nesting = data.Field(tokenize=list, init_token="<bos>",
                                     eos_token="<eos>", batch_first=True)

    inputs_char = data.NestedField(inputs_char_nesting,
                                   init_token="<bos>", eos_token="<eos>")

    labels = data.Field(init_token="<bos>", eos_token="<eos>",
                        batch_first=True)

    fields = ([(('inputs_word', 'inputs_char'), (inputs_word, inputs_char))] +
              [('labels', labels) if label == tag_type else (None, None)
               for label in ['pos', 'chunk', 'ner']])

    # Load the data
    train, val, test = SequenceTaggingDataset.splits(
        path=root, train=train_file, validation=validation_file,
        test=test_file, separator=' ', fields=tuple(fields))

    logger.info('---------- CONLL 2003 %s ---------' % tag_type)
    logger.info('Train size: %d' % (len(train)))
    logger.info('Validation size: %d' % (len(val)))
    logger.info('Test size: %d' % (len(test)))

    # Build vocab
    inputs_char.build_vocab(train.inputs_char, val.inputs_char,
                            test.inputs_char)
    inputs_word.build_vocab(train.inputs_word, val.inputs_word,
                            test.inputs_word, max_size=50000,
                            vectors=[GloVe(name='6B', dim='200'), CharNGram()])
    labels.build_vocab(train.labels)

    logger.info('Input vocab size:%d' % (len(inputs_word.vocab)))
    logger.info('Tagset size: %d' % (len(labels.vocab)))

    # Get iterators
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test), batch_size=batch_size,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))
    train_iter.repeat = False

    return {
        'task': 'conll2003.%s' % tag_type,
        'iters': (train_iter, val_iter, test_iter),
        'vocabs': (inputs_word.vocab, inputs_char.vocab, labels.vocab)
    }
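# Sketch (assumption, not part of the original): initializing an embedding
# layer from the concatenated pretrained vectors built above. GloVe 6B.200d +
# CharNGram 100d gives 300-dim rows; requires the CoNLL-2003 files to be
# present under ./conll2003.
import torch.nn as nn

conll = conll2003_dataset('ner', batch_size=16)
word_vocab = conll['vocabs'][0]
embedding_layer = nn.Embedding.from_pretrained(word_vocab.vectors, freeze=False)
print(embedding_layer.weight.shape)   # (vocab size, 300)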
# build the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(train)

# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

# make iterator for splits
train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=3,
                                                   device=device)

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)

# Approach 2:
# Note: build_vocab does not take a device argument; the device is set on the
# iterator instead (as above).
TEXT.build_vocab(train, vectors=[GloVe(name='840B', dim='300'), CharNGram()])
LABEL.build_vocab(train)

train_iter, test_iter = datasets.TREC.iters(batch_size=4)

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)
def conll2000_dataset(batch_size, use_local=False, root='.data/conll2000',
                      train_file='train.txt', test_file='test.txt',
                      validation_frac=0.1, convert_digits=True):
    """
    conll2000: Conll 2000 (Chunking)

    Extract Conll2000 Chunking dataset using torchtext. By default will fetch
    data files from online repository. Applies GloVe 6B.200d and Char N-gram
    pretrained vectors. Also sets up per word character Field

    Parameters:
        batch_size: Batch size to return from iterator
        use_local: If True use local provided files (default False)
        root (optional): Dataset root directory (needed only if use_local is True)
        train_file (optional): Train filename (needed only if use_local is True)
        test_file (optional): Test filename (needed only if use_local is True)
        validation_frac (optional): Fraction of train dataset to use for validation
        convert_digits (optional): If True will convert numbers to single 0's

    NOTE: Since there is only a train and test set we use 10% of the train set
        as validation

    Returns:
        A dict containing:
            task: 'conll2000.chunk'
            iters: (train iter, validation iter, test iter)
            vocabs: (Inputs word vocabulary, Inputs character vocabulary,
                     Tag vocabulary)
    """
    # Setup fields with batch dimension first
    inputs_word = data.Field(
        init_token="<bos>", eos_token="<eos>", batch_first=True, lower=True,
        preprocessing=data.Pipeline(
            lambda w: '0' if convert_digits and w.isdigit() else w))

    inputs_char_nesting = data.Field(tokenize=list, init_token="<bos>",
                                     eos_token="<eos>", batch_first=True)

    inputs_char = data.NestedField(inputs_char_nesting,
                                   init_token="<bos>", eos_token="<eos>")

    labels = data.Field(init_token="<bos>", eos_token="<eos>",
                        batch_first=True)

    fields = [(('inputs_word', 'inputs_char'), (inputs_word, inputs_char)),
              (None, None), ('labels', labels)]

    if use_local:
        # Load the data
        train, test = SequenceTaggingDataset.splits(
            path=root, train=train_file, test=test_file, fields=tuple(fields))

        # HACK: Saving the sort key function as the split() call removes it
        sort_key = train.sort_key

        # To make the split deterministic
        random.seed(0)
        train, val = train.split(1 - validation_frac,
                                 random_state=random.getstate())
        # Reset the seed
        random.seed()

        # HACK: Set the sort key
        train.sort_key = sort_key
        val.sort_key = sort_key
    else:
        train, val, test = CoNLL2000Chunking.splits(
            fields=tuple(fields), validation_frac=validation_frac)

    logger.info('---------- CONLL 2000 Chunking ---------')
    logger.info('Train size: %d' % (len(train)))
    logger.info('Validation size: %d' % (len(val)))
    logger.info('Test size: %d' % (len(test)))

    # Build vocab
    inputs_char.build_vocab(train.inputs_char, val.inputs_char,
                            test.inputs_char)
    inputs_word.build_vocab(train.inputs_word, val.inputs_word,
                            test.inputs_word, max_size=50000,
                            vectors=[GloVe(name='6B', dim='200'), CharNGram()])
    labels.build_vocab(train.labels)

    logger.info('Input vocab size:%d' % (len(inputs_word.vocab)))
    logger.info('Tagset size: %d' % (len(labels.vocab)))

    # Get iterators
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test), batch_size=batch_size,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))
    train_iter.repeat = False

    return {
        'task': 'conll2000.chunk',
        'iters': (train_iter, val_iter, test_iter),
        'vocabs': (inputs_word.vocab, inputs_char.vocab, labels.vocab)
    }